# Glacier mass balance ML models

# Part I: Producing a dataset

First, we import the dependencies and we configure OGGM.

In [1]:
import xarray as xr
import rioxarray
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import oggm
from oggm import cfg, utils, workflow, tasks, graphics
from oggm import entity_task
from oggm.core import gis
from oggm.utils import DEM_SOURCES
from pathlib import Path
import os
import logging

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns',None)

# Load the OGGM default parameters, then override the ones needed here.
cfg.initialize(logging_level='WARNING')
cfg.PARAMS['border'] = 40
# Must come after `cfg.initialize`: the defaults switch multiprocessing off.
cfg.PARAMS['use_multiprocessing'] = True 
# Module logger
# The logger name is `__name__` minus its last dotted component. When
# `__name__` contains no dot the name is the empty string, which
# `logging.getLogger` maps to the root logger.
log = logging.getLogger('.'.join(__name__.split('.')[:-1]))

2022-06-21 15:53:22: oggm.cfg: Reading default parameters from the OGGM `params.cfg` configuration file.
2022-06-21 15:53:22: oggm.cfg: Multiprocessing switched OFF according to the parameter file.
2022-06-21 15:53:22: oggm.cfg: Multiprocessing: using all available processors (N=64)
2022-06-21 15:53:22: oggm.cfg: Multiprocessing switched ON after user settings.


Choose the OGGM working directory where all the data will be stored.

In [2]:
# The workspace lives next to the current directory, in `OGGM_data_Finse`.
parent_path = os.path.dirname(Path().resolve())
workspace_path = os.path.join(parent_path, 'OGGM_data_Finse')
# Create the folder when it is missing and register it with OGGM in every
# case. Previously the working dir was only set when the folder already
# existed, so the very first run left `cfg.PATHS['working_dir']` untouched.
os.makedirs(workspace_path, exist_ok=True)
cfg.PATHS['working_dir'] = workspace_path

Download all data from glaciers in Scandinavia (RGI region '08').

In [3]:
# RGI region and inventory version to work with.
rgi_region = '08'
rgi_version = '6'
# NOTE(review): `rgi_dir` is not used again in the cells visible here.
rgi_dir = utils.get_rgi_dir(version=rgi_version)
# Locate the outlines file of the region and read it into a GeoDataFrame.
path = utils.get_rgi_region_file(region=rgi_region, version=rgi_version)
rgidf = gpd.read_file(path)
# Initialise the glacier directories from pre-processing level 3.
# NOTE(review): prepro_border=10 differs from cfg.PARAMS['border'] = 40 set
# in the first cell - confirm which border is intended.
gdirs = workflow.init_glacier_directories(rgidf, from_prepro_level=3, prepro_border=10)

2022-06-21 15:53:24: oggm.workflow: init_glacier_directories from prepro level 3 on 3417 glaciers.
2022-06-21 15:53:24: oggm.workflow: Execute entity tasks [gdir_from_prepro] on 3417 glaciers
Process ForkPoolWorker-32:
Process ForkPoolWorker-24:
Process ForkPoolWorker-31:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/jovyan/.conda/envs/oggm_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/jovyan/.conda/envs/oggm_env/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/jovyan/.conda/envs/oggm_env/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-15:
Traceback (most recent call last):
  File "/home/jovyan/.conda/envs/oggm_env/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/jovyan/.conda/envs/oggm_env/lib/pytho

## Get geodetic glacier mass balance data

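We load the geodetic mass balance (MB) estimates from Hugonnet et al. (2021). Note that the table returned below covers glaciers from all RGI regions (01 to 19), not only Scandinavia.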

In [None]:
# Geodetic mass balance table, indexed by RGI id.
mbdf = utils.get_geodetic_mb_dataframe()

In [None]:
# Display only: `drop` returns a new frame and the result is not assigned,
# so `mbdf` itself still contains these columns afterwards.
mbdf.drop(columns=['area', 'reg', 'is_cor'])

Unnamed: 0_level_0,period,dmdtda,err_dmdtda
rgiid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RGI60-01.00001,2000-01-01_2010-01-01,0.021700,0.430000
RGI60-01.00001,2000-01-01_2020-01-01,-0.012800,0.217600
RGI60-01.00001,2010-01-01_2020-01-01,-0.047200,0.394900
RGI60-01.00002,2000-01-01_2010-01-01,-0.168300,0.279200
RGI60-01.00002,2000-01-01_2020-01-01,-0.229000,0.146000
...,...,...,...
RGI60-19.02751,2000-01-01_2020-01-01,-0.136311,0.295265
RGI60-19.02751,2010-01-01_2020-01-01,-0.119835,0.536447
RGI60-19.02752,2000-01-01_2010-01-01,0.121300,0.542100
RGI60-19.02752,2000-01-01_2020-01-01,-0.038600,0.289700


In [None]:
# Name the index `RGI_ID`, the same index name given to `topo_df` further down.
mbdf.index.names = ['RGI_ID']

## Get glacier climate data

Now we get the climate data from CRU for each glacier. We generate an entity task to run in parallel in OGGM.

In [None]:
@entity_task(log)
def get_climate_data(gdir, period=pd.date_range(start="2000-01-01",end="2020-01-01")):
    """Compute yearly climate predictors for one glacier.

    Reads the glacier's ``climate_historical`` file, moves the temperature
    series from the climate reference height to the glacier median elevation
    with a constant lapse rate, and sums temperature and precipitation per
    calendar year.

    Parameters
    ----------
    gdir : glacier directory
        Must provide ``read_shapefile('outlines')`` (with a ``Zmed`` column)
        and ``get_filepath('climate_historical')``.
    period : pandas.DatetimeIndex, optional
        Timestamps to sample. Each one is matched to the nearest time step
        of the climate file. Defaults to daily stamps from 2000-01-01 to
        2020-01-01.

    Returns
    -------
    dict
        ``'PDD'``: yearly sum of the temperatures above 0.
        ``'snowfall'``: yearly sum of precipitation where temperature <= 0.
        ``'rainfall'``: yearly sum of precipitation where temperature > 0.
        ``'years'``: year of every sampled time step (one entry per sample,
        not one per yearly group).
    """
    # Super rough constant temperature lapse rate, in degrees per metre.
    lapse_rate = 6.0 / 1000.0

    zmed = float(gdir.read_shapefile('outlines').Zmed.values[0])
    fpath = gdir.get_filepath('climate_historical')

    # Open the file in a context manager so its handle is released once the
    # arrays are computed; the task runs once per glacier.
    with xr.open_dataset(fpath) as ds:
        # Trim the data to the desired period.
        # NOTE(review): if the file's time step is coarser than `period`, the
        # same step is selected several times and counted that many times in
        # the yearly sums below - confirm this is intended for precipitation.
        clim = ds.sel(time=period, method='nearest')

        # Temperature at the glacier median elevation. The correction is
        # subtracted: a glacier above the reference height must get colder
        # (the previous code added it, warming higher glaciers).
        temp = clim.temp - lapse_rate * (zmed - clim.ref_hgt)

        pdd = temp.where(temp > 0.0).groupby('time.year').sum().data
        snowfall = clim.prcp.where(temp <= 0.0).groupby('time.year').sum().data
        rainfall = clim.prcp.where(temp > 0.0).groupby('time.year').sum().data
        # NOTE(review): this has one entry per sampled time step, so it is
        # longer than the yearly arrays above - confirm whether the unique
        # years were meant instead.
        years = clim.time.dt.year.data

    return {'PDD': pdd,
            'snowfall': snowfall,
            'rainfall': rainfall,
            'years': years}

We run the function in parallel as an entity task.

In [None]:
# Run `get_climate_data` on every glacier directory through the OGGM workflow.
climate_data = workflow.execute_entity_task(get_climate_data, gdirs)

2022-06-21 15:54:55: oggm.workflow: Execute entity tasks [get_climate_data] on 3417 glaciers


In [None]:
# Scratch cell: repeats the first steps of `get_climate_data` on one glacier.
# `gdir` is otherwise only assigned in the topography section further down,
# so running the notebook top to bottom failed here with a NameError.
# Pick the same sample glacier explicitly.
gdir = gdirs[100]
fpath = gdir.get_filepath('climate_historical')
clim = xr.open_dataset(fpath)
zmed = float(gdir.read_shapefile('outlines').Zmed.values[0])
period=pd.date_range(start="2000-01-01",end="2020-01-01")

# Keep, for each timestamp of `period`, the nearest time step of the file.
clim = clim.sel(time=period, method='nearest')

In [None]:
# Move temperatures from the climate reference height to the glacier median
# elevation. The correction is subtracted so that a glacier lying above the
# reference height gets colder (it was previously added).
clim.temp.data = clim.temp.data - 6.0/1000.0*(zmed - clim.ref_hgt)

In [None]:
# Display the temperature variable of the trimmed dataset.
clim.temp

In [None]:
# Call the task directly on a single glacier directory to inspect its output.
get_climate_data(gdir)

## Get glacier topographical data

Now we get the topographical data of all glaciers, to be used as predictors in the training.

In [None]:
# Pick one glacier directory as a sample for the exploratory cells.
gdir = gdirs[100]

In [None]:
# Path of the DEM file stored in the sample glacier directory.
dem_path = gdir.get_filepath('dem')

In [None]:
# Open the DEM raster and plot it with a terrain colour map.
da = rioxarray.open_rasterio(dem_path)
f, ax = plt.subplots()
da.plot(cmap='terrain', ax=ax);
# Add the outlines
# (drawn unfilled with a black edge so the DEM stays visible underneath)
gdir.read_shapefile('outlines').plot(ax=ax, color='none', edgecolor='black');

In [None]:
@entity_task(log)
def get_topo_predictors(gdir):
    """Gather the topographical predictors of one glacier.

    The elevation and slope values come from the first row of the glacier's
    ``outlines`` shapefile; area, latitude, ice-cap flag and id are read from
    the glacier directory itself.

    Returns a dict with the keys ``zmed``, ``zmax``, ``zmin``, ``area``,
    ``slope``, ``lat``, ``icecap`` (0 or 1) and ``ID``.
    """
    outlines = gdir.read_shapefile('outlines')

    def first_value(column):
        # First entry of an outlines column, as a plain float.
        return float(getattr(outlines, column).values[0])

    # Key order is kept as before: it fixes the column order of the
    # DataFrame built from these dicts.
    return {
        'zmed': first_value('Zmed'),
        'zmax': first_value('Zmax'),
        'zmin': first_value('Zmin'),
        'area': gdir.rgi_area_km2,
        'slope': first_value('Slope'),
        'lat': gdir.cenlat,
        'icecap': int(gdir.is_icecap),
        'ID': gdir.rgi_id,
    }

We parallelize this by running the function as an OGGM entity task.

In [None]:
# Run `get_topo_predictors` on every glacier directory through the OGGM workflow.
topo_dicts = workflow.execute_entity_task(get_topo_predictors, gdirs)

In [None]:
# One row per glacier, built from the per-glacier dicts.
topo_df = pd.DataFrame(topo_dicts)
# Index the table by glacier id, under the index name `RGI_ID`.
topo_df.index = topo_df.ID
topo_df.index.name = 'RGI_ID'
# Display only: `drop` returns a new frame and the result is not assigned,
# so `topo_df` still has its `ID` column after this cell.
topo_df.drop(columns='ID')

In [None]:
# `training_df` was never assigned before being written, which raised a
# NameError. Build it from the topographical predictors, without the `ID`
# column that only duplicates the `RGI_ID` index, then save it.
training_df = topo_df.drop(columns='ID')
training_df.to_csv('training_df.csv')

To avoid computing all topographical predictors each time, just load the previously stored file.

In [None]:
# Reload the stored predictors.
# NOTE(review): without `index_col`, an index written by `to_csv` comes back
# as an ordinary column - confirm whether later cells expect that.
training_df = pd.read_csv('training_df.csv')