Merge pull request #115 from JannisHoch/poly_neighbours

Poly neighbours
JannisHoch · Dec 14, 2020 · d4d9253 · d4d9253
2 parents b5cc738 + 6969a35
commit d4d9253
Show file tree

Hide file tree

Showing 8 changed files with 185 additions and 213 deletions.
diff --git a/copro/conflict.py b/copro/conflict.py
@@ -3,7 +3,7 @@
 import numpy as np
 import os, sys
 
-def conflict_in_year_bool(conflict_gdf, extent_gdf, sim_year): 
+def conflict_in_year_bool(config, conflict_gdf, extent_gdf, sim_year): 
     """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.
 
     Args:
@@ -48,7 +48,7 @@ def conflict_in_year_bool(conflict_gdf, extent_gdf, sim_year):
 
     return list_out
 
-def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None):
+def conflict_in_previous_year(config, conflict_gdf, extent_gdf, sim_year, t_0_flag=None):
     """Creates a list for each timestep with boolean information whether a conflict took place in a polygon at the previous timestep or not.
     If the current time step is the first (t=0), then conflict data of this year is used instead due to the lack of earlier data.
 
@@ -68,7 +68,7 @@ def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None)
 
     # if it is the first time step (t_0), the data of this year will be used
     if t_0_flag == True:
-        print('... it is the first year, so no conflict for previous year is known')
+        if config.getboolean('general', 'verbose'): print('DEBUG: first year of simulation period -> conflict at t-1 set to conflict at t')
         temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
     # else, the data from the previous time step (t-1) is used
     elif t_0_flag == None:
@@ -79,17 +79,21 @@ def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None)
     # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
     data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
 
-    fatalities_per_poly = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+    # determine log-transformed count of unique conflicts per water province
+    # the id column refers to the conflict id, not the water province id!
+    if config.getboolean('general', 'verbose'): print('DEBUG: computing log-transformed count of conflicts at t-1')
+    conflicts_per_poly = np.log(data_merged.id.groupby(data_merged['watprovID']).count().to_frame())
 
     # loop through all regions and check if exists in sub-set
     # if so, this means that there was conflict and thus assign value 1
     list_out = []
     for i in range(len(extent_gdf)):
         i_poly = extent_gdf.iloc[i]['watprovID']
-        if i_poly in fatalities_per_poly.index.values:
-            list_out.append(1)
+        if i_poly in conflicts_per_poly.index.values:
+            val = float(conflicts_per_poly.id.loc[conflicts_per_poly.index == i_poly].values[0])
+            list_out.append(val)
         else:
-            list_out.append(0)
+            list_out.append(float(0.))
 
     if not len(extent_gdf) == len(list_out):
         raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))

diff --git a/copro/data.py b/copro/data.py
@@ -85,7 +85,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
             if key == 'conflict':
 
                 data_series = value
-                data_list = conflict.conflict_in_year_bool(conflict_gdf, polygon_gdf, sim_year)
+                data_list = conflict.conflict_in_year_bool(config, conflict_gdf, polygon_gdf, sim_year)
                 data_series = data_series.append(pd.Series(data_list), ignore_index=True)
                 XY[key] = data_series
 
@@ -94,7 +94,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
                 data_series = value
                 if i==0: t_0_flag = True
                 else: t_0_flag = None
-                data_list = conflict.conflict_in_previous_year(conflict_gdf, polygon_gdf, sim_year, t_0_flag=t_0_flag)
+                data_list = conflict.conflict_in_previous_year(config, conflict_gdf, polygon_gdf, sim_year, t_0_flag=t_0_flag)
                 data_series = data_series.append(pd.Series(data_list), ignore_index=True)
                 XY[key] = data_series
 
@@ -114,7 +114,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
 
             else:
 
-                nc_ds = xr.open_dataset(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)))
+                nc_ds = xr.open_dataset(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)).rsplit(',')[0])
 
                 if (np.dtype(nc_ds.time) == np.float32) or (np.dtype(nc_ds.time) == np.float64):
                     data_series = value

diff --git a/copro/variables.py b/copro/variables.py
@@ -5,11 +5,12 @@
 import rasterstats as rstats
 import numpy as np
 import os, sys
+import math
 
 import warnings
 warnings.filterwarnings("ignore")
 
-def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year, stat_func='mean'):
+def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year):
     """This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
     By default, the mean value of all cells within a polygon is computed.
     The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
@@ -29,51 +30,79 @@ def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year, st
         root_dir (str): path to location of cfg-file. 
         var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file.
         sim_year (int): year for which data is extracted.
-        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
 
     Raises:
         ValueError: raised if the extracted variable at a time step does not contain data
 
     Returns:
         list: list containing statistical value per polygon, i.e. with same length as extent_gdf
     """   
-    # get path to netCDF-file.
-    # nc_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')), 
-    #                      config.get('data', var_name))
 
-    nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))
+    data_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name)).rsplit(',')
+
+    if len(data_fo) != 3:
+        raise ValueError('ERROR: not all settings for input data set {} provided - it must contain of path, False/True, and statistical method'.format(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))))
+    else:
+        nc_fo = data_fo[0]
+        ln_flag = bool(data_fo[1])
+        stat_method = str(data_fo[2])
 
     if config.getboolean('general', 'verbose'): print('DEBUG: calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))
 
+    print(nc_fo)
+    print(ln_flag)
+
     # open nc-file with xarray as dataset
     nc_ds = xr.open_dataset(nc_fo)
     # get xarray data-array for specified variable
     nc_var = nc_ds[var_name]
-
+    if ln_flag:
+        nc_var = np.log(nc_var)
+        if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
     # open nc-file with rasterio to get affine information
     affine = rio.open(nc_fo).transform
 
     # get values from data-array for specified year
     nc_arr = nc_var.sel(time=sim_year)
     nc_arr_vals = nc_arr.values
     if nc_arr_vals.size == 0:
-        raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+        raise ValueError('ERROR: the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
 
     # initialize output list
     list_out = []
     # loop through all polygons in geo-dataframe and compute statistics, then append to output file
+    print('DEBUG: computing zonal statistic with method {}'.format(stat_method))
     for i in range(len(extent_gdf)):
+
+        # province i
         prov = extent_gdf.iloc[i]
-        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
-        if (zonal_stats[0][stat_func] == None) and (config.getboolean('general', 'verbose')): 
+
+        # compute zonal stats for this province
+        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_method)
+        val = zonal_stats[0][stat_method]
+
+        # # if specified, log-transform value
+        if ln_flag:
+            if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
+            # works only if zonal stats is not None, i.e. if it's None it stays None
+            if val != None: val = np.log(val)
+
+        # in case log-transformed value results in -inf, replace with None
+        if val == -math.inf:
+            if config.getboolean('general', 'verbose'): print('INFO: set -inf to None')
+            val = None
+
+        # print a warning if result is None
+        if (val == None) and (config.getboolean('general', 'verbose')): 
             print('WARNING: NaN computed!')
-        list_out.append(zonal_stats[0][stat_func])
+
+        list_out.append(val)
 
     if config.getboolean('general', 'verbose'): print('DEBUG: ... done.')
 
     return list_out
 
-def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name, sim_year, stat_func='mean'):
+def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name, sim_year):
     """This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
     By default, the mean value of all cells within a polygon is computed.
     The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
@@ -90,7 +119,6 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
         root_dir (str): path to location of cfg-file. 
         var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file.
         sim_year (int): year for which data is extracted.
-        stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
 
     Raises:
         ValueError: raised if specfied year cannot be found in years in nc-file
@@ -99,12 +127,16 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
     Returns:
         list: list containing statistical value per polygon, i.e. with same length as extent_gdf
     """   
-    # get path to netCDF-file.
-    # nc_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')), 
-    #                      config.get('data', var_name))
 
-    nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))
-
+    data_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name)).rsplit(',')
+
+    if len(data_fo) != 3:
+        raise ValueError('ERROR: not all settings for input data set {} provided - it must contain of path, False/True, and statistical method'.format(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))))
+    else:
+        nc_fo = data_fo[0]
+        ln_flag = bool(data_fo[1])
+        stat_method = str(data_fo[2])
+
     if config.getboolean('general', 'verbose'): print('DEBUG: calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))
 
     # open nc-file with xarray as dataset
@@ -114,28 +146,48 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
     # get years contained in nc-file as integer array to be compatible with sim_year
     years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)
     if sim_year not in years:
-        raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
+        raise ValueError('ERROR: the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
 
     # get index which corresponds with sim_year in years in nc-file
     sim_year_idx = int(np.where(years == sim_year)[0])
     # get values from data-array for specified year based on index
     nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])
     nc_arr_vals = nc_arr.values
     if nc_arr_vals.size == 0:
-        raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
+        raise ValueError('ERROR: no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
 
     # open nc-file with rasterio to get affine information
     affine = rio.open(nc_fo).transform
 
     # initialize output list
     list_out = []
     # loop through all polygons in geo-dataframe and compute statistics, then append to output file
+    print('DEBUG: computing zonal statistic with method {}'.format(stat_method))
     for i in range(len(extent_gdf)):
+
+        # province i
         prov = extent_gdf.iloc[i]
-        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
-        if (zonal_stats[0][stat_func] == None) and (config.getboolean('general', 'verbose')): 
+
+        # compute zonal stats for this province
+        zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_method)
+        val = zonal_stats[0][stat_method]
+
+        # # if specified, log-transform value
+        if ln_flag:
+            if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
+            # works only if zonal stats is not None, i.e. if it's None it stays None
+            if val != None: val = np.log(val)
+
+        # in case log-transformed value results in -inf, replace with None
+        if val == -math.inf:
+            if config.getboolean('general', 'verbose'): print('INFO: set -inf to None')
+            val = None
+
+        # print a warning if result is None
+        if (val == None) and (config.getboolean('general', 'verbose')): 
             print('WARNING: NaN computed!')
-        list_out.append(zonal_stats[0][stat_func])
+
+        list_out.append(val)
 
     if config.getboolean('general', 'verbose'): print('DEBUG: ... done.')
 

diff --git a/docs/_static/roc_curve.png b/docs/_static/roc_curve.png
diff --git a/example/example_settings.cfg b/example/example_settings.cfg
@@ -39,11 +39,14 @@ zones=BWh,BSh
 code2class=KoeppenGeiger/classification_codes.txt
 
 [data]
-# variable name here needs to be identical with variable name in nc-file
-total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
-precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
-temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
-irr_water_demand=hydro/irrWaterDemand.nc
+# for each variable, specify three comma-separated settings in this order:
+# path to the nc-file, whether the variable shall be log-transformed (True or False), and the statistical function to apply
+# the entry is split at the commas as-is, so do not add spaces around them, e.g. hydro/irrWaterDemand.nc,False,sum
+# NOTE: the variable name (the key left of '=') needs to be identical to the variable name in the nc-file
+# NOTE: only statistical functions supported by rasterstats are valid
+total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc,True,mean
+precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,True,mean
+temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,False,mean
+irr_water_demand=hydro/irrWaterDemand.nc,False,sum
 
 [machine_learning]
 # choose from: MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

diff --git a/example/example_settings_proj.cfg b/example/example_settings_proj.cfg
@@ -39,11 +39,14 @@ zones=BWh,BSh
 code2class=KoeppenGeiger/classification_codes.txt
 
 [data]
-# variable name here needs to be identical with variable name in nc-file
-total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
-precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
-temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
-irr_water_demand=hydro/irrWaterDemand.nc
+# for each variable, specify three comma-separated settings in this order:
+# path to the nc-file, whether the variable shall be log-transformed (True or False), and the statistical function to apply
+# the entry is split at the commas as-is, so do not add spaces around them, e.g. hydro/irrWaterDemand.nc,False,sum
+# NOTE: the variable name (the key left of '=') needs to be identical to the variable name in the nc-file
+# NOTE: only statistical functions supported by rasterstats are valid
+total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc,True,mean
+precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,True,mean
+temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,False,mean
+irr_water_demand=hydro/irrWaterDemand.nc,False,sum
 
 [machine_learning]
 # choose from: MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer