Skip to content

Commit

Permalink
Merge pull request #115 from JannisHoch/poly_neighbours
Browse files Browse the repository at this point in the history
Poly neighbours
  • Loading branch information
JannisHoch committed Dec 14, 2020
2 parents b5cc738 + 6969a35 commit d4d9253
Show file tree
Hide file tree
Showing 8 changed files with 185 additions and 213 deletions.
18 changes: 11 additions & 7 deletions copro/conflict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import os, sys

def conflict_in_year_bool(conflict_gdf, extent_gdf, sim_year):
def conflict_in_year_bool(config, conflict_gdf, extent_gdf, sim_year):
"""Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.
Args:
Expand Down Expand Up @@ -48,7 +48,7 @@ def conflict_in_year_bool(conflict_gdf, extent_gdf, sim_year):

return list_out

def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None):
def conflict_in_previous_year(config, conflict_gdf, extent_gdf, sim_year, t_0_flag=None):
"""Creates a list for each timestep with boolean information whether a conflict took place in a polygon at the previous timestep or not.
If the current time step is the first (t=0), then conflict data of this year is used instead due to the lack of earlier data.
Expand All @@ -68,7 +68,7 @@ def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None)

# if it is the first time step (t_0), the data of this year will be used
if t_0_flag == True:
print('... it is the first year, so no conflict for previous year is known')
if config.getboolean('general', 'verbose'): print('DEBUG: first year of simulation period -> conflict at t-1 set to conflict at t')
temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
# else, the data from the previous time step (t-1) is used
elif t_0_flag == None:
Expand All @@ -79,17 +79,21 @@ def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None)
# merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
data_merged = gpd.sjoin(temp_sel_year, extent_gdf)

fatalities_per_poly = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
# determine log-transformed count of unique conflicts per water province
# the id column refers to the conflict id, not the water province id!
if config.getboolean('general', 'verbose'): print('DEBUG: computing log-transformed count of conflicts at t-1')
conflicts_per_poly = np.log(data_merged.id.groupby(data_merged['watprovID']).count().to_frame())

# loop through all regions and check if exists in sub-set
# if so, this means that there was conflict and thus assign value 1
list_out = []
for i in range(len(extent_gdf)):
i_poly = extent_gdf.iloc[i]['watprovID']
if i_poly in fatalities_per_poly.index.values:
list_out.append(1)
if i_poly in conflicts_per_poly.index.values:
val = float(conflicts_per_poly.id.loc[conflicts_per_poly.index == i_poly].values[0])
list_out.append(val)
else:
list_out.append(0)
list_out.append(float(0.))

if not len(extent_gdf) == len(list_out):
raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))
Expand Down
6 changes: 3 additions & 3 deletions copro/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
if key == 'conflict':

data_series = value
data_list = conflict.conflict_in_year_bool(conflict_gdf, polygon_gdf, sim_year)
data_list = conflict.conflict_in_year_bool(config, conflict_gdf, polygon_gdf, sim_year)
data_series = data_series.append(pd.Series(data_list), ignore_index=True)
XY[key] = data_series

Expand All @@ -94,7 +94,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
data_series = value
if i==0: t_0_flag = True
else: t_0_flag = None
data_list = conflict.conflict_in_previous_year(conflict_gdf, polygon_gdf, sim_year, t_0_flag=t_0_flag)
data_list = conflict.conflict_in_previous_year(config, conflict_gdf, polygon_gdf, sim_year, t_0_flag=t_0_flag)
data_series = data_series.append(pd.Series(data_list), ignore_index=True)
XY[key] = data_series

Expand All @@ -114,7 +114,7 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):

else:

nc_ds = xr.open_dataset(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)))
nc_ds = xr.open_dataset(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', key)).rsplit(',')[0])

if (np.dtype(nc_ds.time) == np.float32) or (np.dtype(nc_ds.time) == np.float64):
data_series = value
Expand Down
98 changes: 75 additions & 23 deletions copro/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import rasterstats as rstats
import numpy as np
import os, sys
import math

import warnings
warnings.filterwarnings("ignore")

def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year, stat_func='mean'):
def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year):
"""This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
By default, the mean value of all cells within a polygon is computed.
The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
Expand All @@ -29,51 +30,79 @@ def nc_with_float_timestamp(extent_gdf, config, root_dir, var_name, sim_year, st
root_dir (str): path to location of cfg-file.
var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file.
sim_year (int): year for which data is extracted.
stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
Raises:
ValueError: raised if the extracted variable at a time step does not contain data
Returns:
list: list containing statistical value per polygon, i.e. with same length as extent_gdf
"""
# get path to netCDF-file.
# nc_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')),
# config.get('data', var_name))

nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))
data_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name)).rsplit(',')

if len(data_fo) != 3:
raise ValueError('ERROR: not all settings for input data set {} provided - it must contain of path, False/True, and statistical method'.format(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))))
else:
nc_fo = data_fo[0]
ln_flag = bool(data_fo[1])
stat_method = str(data_fo[2])

if config.getboolean('general', 'verbose'): print('DEBUG: calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))

print(nc_fo)
print(ln_flag)

# open nc-file with xarray as dataset
nc_ds = xr.open_dataset(nc_fo)
# get xarray data-array for specified variable
nc_var = nc_ds[var_name]

if ln_flag:
nc_var = np.log(nc_var)
if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
# open nc-file with rasterio to get affine information
affine = rio.open(nc_fo).transform

# get values from data-array for specified year
nc_arr = nc_var.sel(time=sim_year)
nc_arr_vals = nc_arr.values
if nc_arr_vals.size == 0:
raise ValueError('the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
raise ValueError('ERROR: the data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

# initialize output list
list_out = []
# loop through all polygons in geo-dataframe and compute statistics, then append to output file
print('DEBUG: computing zonal statistic with method {}'.format(stat_method))
for i in range(len(extent_gdf)):

# province i
prov = extent_gdf.iloc[i]
zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
if (zonal_stats[0][stat_func] == None) and (config.getboolean('general', 'verbose')):

# compute zonal stats for this province
zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_method)
val = zonal_stats[0][stat_method]

# # if specified, log-transform value
if ln_flag:
if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
# works only if zonal stats is not None, i.e. if it's None it stays None
if val != None: val = np.log(val)

# in case log-transformed value results in -inf, replace with None
if val == -math.inf:
if config.getboolean('general', 'verbose'): print('INFO: set -inf to None')
val = None

# print a warning if result is None
if (val == None) and (config.getboolean('general', 'verbose')):
print('WARNING: NaN computed!')
list_out.append(zonal_stats[0][stat_func])

list_out.append(val)

if config.getboolean('general', 'verbose'): print('DEBUG: ... done.')

return list_out

def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name, sim_year, stat_func='mean'):
def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name, sim_year):
"""This function extracts a statistical value from a netCDF-file (specified in the config-file) for each polygon specified in extent_gdf for a given year.
By default, the mean value of all cells within a polygon is computed.
The resulting list does not contain additional meta-information about the files or polygons and is mostly intended for data-driven approaches such as machine learning.
Expand All @@ -90,7 +119,6 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
root_dir (str): path to location of cfg-file.
var_name (str): name of variable in nc-file, must also be the same under which path to nc-file is specified in cfg-file.
sim_year (int): year for which data is extracted.
stat_func (str, optional): Statistical function to be applied, choose from available options in rasterstats package. Defaults to 'mean'.
Raises:
ValueError: raised if specified year cannot be found in years in nc-file
Expand All @@ -99,12 +127,16 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
Returns:
list: list containing statistical value per polygon, i.e. with same length as extent_gdf
"""
# get path to netCDF-file.
# nc_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')),
# config.get('data', var_name))

nc_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))

data_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name)).rsplit(',')

if len(data_fo) != 3:
raise ValueError('ERROR: not all settings for input data set {} provided - it must contain of path, False/True, and statistical method'.format(os.path.join(root_dir, config.get('general', 'input_dir'), config.get('data', var_name))))
else:
nc_fo = data_fo[0]
ln_flag = bool(data_fo[1])
stat_method = str(data_fo[2])

if config.getboolean('general', 'verbose'): print('DEBUG: calculating mean {0} per aggregation unit from file {1} for year {2}'.format(var_name, nc_fo, sim_year))

# open nc-file with xarray as dataset
Expand All @@ -114,28 +146,48 @@ def nc_with_continous_datetime_timestamp(extent_gdf, config, root_dir, var_name,
# get years contained in nc-file as integer array to be compatible with sim_year
years = pd.to_datetime(nc_ds.time.values).to_period(freq='Y').strftime('%Y').to_numpy(dtype=int)
if sim_year not in years:
raise ValueError('the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))
raise ValueError('ERROR: the simulation year {0} can not be found in file {1}'.format(sim_year, nc_fo))

# get index which corresponds with sim_year in years in nc-file
sim_year_idx = int(np.where(years == sim_year)[0])
# get values from data-array for specified year based on index
nc_arr = nc_var.sel(time=nc_ds.time.values[sim_year_idx])
nc_arr_vals = nc_arr.values
if nc_arr_vals.size == 0:
raise ValueError('no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))
raise ValueError('ERROR: no data was found for this year in the nc-file {}, check if all is correct'.format(nc_fo))

# open nc-file with rasterio to get affine information
affine = rio.open(nc_fo).transform

# initialize output list
list_out = []
# loop through all polygons in geo-dataframe and compute statistics, then append to output file
print('DEBUG: computing zonal statistic with method {}'.format(stat_method))
for i in range(len(extent_gdf)):

# province i
prov = extent_gdf.iloc[i]
zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_func)
if (zonal_stats[0][stat_func] == None) and (config.getboolean('general', 'verbose')):

# compute zonal stats for this province
zonal_stats = rstats.zonal_stats(prov.geometry, nc_arr_vals, affine=affine, stats=stat_method)
val = zonal_stats[0][stat_method]

# # if specified, log-transform value
if ln_flag:
if config.getboolean('general', 'verbose'): print('DEBUG: log-transform variable {}'.format(var_name))
# works only if zonal stats is not None, i.e. if it's None it stays None
if val != None: val = np.log(val)

# in case log-transformed value results in -inf, replace with None
if val == -math.inf:
if config.getboolean('general', 'verbose'): print('INFO: set -inf to None')
val = None

# print a warning if result is None
if (val == None) and (config.getboolean('general', 'verbose')):
print('WARNING: NaN computed!')
list_out.append(zonal_stats[0][stat_func])

list_out.append(val)

if config.getboolean('general', 'verbose'): print('DEBUG: ... done.')

Expand Down
Binary file modified docs/_static/roc_curve.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 8 additions & 5 deletions example/example_settings.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ zones=BWh,BSh
code2class=KoeppenGeiger/classification_codes.txt

[data]
# variable name here needs to be identical with variable name in nc-file
total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
irr_water_demand=hydro/irrWaterDemand.nc
# for each variable, specify the path to the nc-file, whether the variable shall be log-transformed (True or False), and which statistical function shall be applied
# these three settings must be separated by commas, i.e. path,True/False,statistical function
# NOTE: variable name here needs to be identical with variable name in nc-file
# NOTE: only statistical functions supported by rasterstats are valid
total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc,True,mean
precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,True,mean
temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,False,mean
irr_water_demand=hydro/irrWaterDemand.nc,False,sum

[machine_learning]
# choose from: MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
Expand Down
13 changes: 8 additions & 5 deletions example/example_settings_proj.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ zones=BWh,BSh
code2class=KoeppenGeiger/classification_codes.txt

[data]
# variable name here needs to be identical with variable name in nc-file
total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc
precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc
irr_water_demand=hydro/irrWaterDemand.nc
# for each variable, specify the path to the nc-file, whether the variable shall be log-transformed (True or False), and which statistical function shall be applied
# these three settings must be separated by commas, i.e. path,True/False,statistical function
# NOTE: variable name here needs to be identical with variable name in nc-file
# NOTE: only statistical functions supported by rasterstats are valid
total_evaporation=hydro/totalEvaporation_monthTot_output_2000_2015_Africa_yearmean.nc,True,mean
precipitation=hydro/precipitation_monthTot_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,True,mean
temperature=hydro/temperature_monthAvg_output_2000-01-31_to_2015-12-31_Africa_yearmean.nc,False,mean
irr_water_demand=hydro/irrWaterDemand.nc,False,sum

[machine_learning]
# choose from: MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
Expand Down

0 comments on commit d4d9253

Please sign in to comment.