Merge pull request #111 from JannisHoch/poly_neighbours

Poly neighbours
JannisHoch · Dec 9, 2020 · b5cc738 · b5cc738
2 parents 79d847b + f4a6a15
commit b5cc738
Show file tree

Hide file tree

Showing 9 changed files with 331 additions and 40 deletions.
diff --git a/copro/conflict.py b/copro/conflict.py
@@ -3,20 +3,19 @@
 import numpy as np
 import os, sys
 
-def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year): 
+def conflict_in_year_bool(conflict_gdf, extent_gdf, sim_year): 
     """Creates a list for each timestep with boolean information whether a conflict took place in a polygon or not.
 
     Args:
-        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data)
-        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted
-        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
-        sim_year (int): year for which data is extracted
+        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data).
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted.
+        sim_year (int): year for which data is extracted.
 
     Raises:
-        AssertionError: raised if the length of output list does not match length of input geo-dataframe
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
 
     Returns:
-        list: list containing 0/1 per polygon depending on conflict occurence
+        list: list containing 0/1 per polygon depending on conflict occurrence.
     """    
 
     # select the entries which occured in this year
@@ -27,19 +26,67 @@ def conflict_in_year_bool(conflict_gdf, extent_gdf, config, sim_year):
 
     # determine the aggregated amount of fatalities in one region (e.g. water province)
     try:
-        fatalities_per_watProv = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+        fatalities_per_poly = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
     except:
-        fatalities_per_watProv = data_merged['best'].groupby(data_merged['name']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+        fatalities_per_poly = data_merged['best'].groupby(data_merged['name']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
 
     # loop through all regions and check if exists in sub-set
     # if so, this means that there was conflict and thus assign value 1
     list_out = []
     for i in range(len(extent_gdf)):
         try:
-            i_watProv = extent_gdf.iloc[i]['watprovID']
+            i_poly = extent_gdf.iloc[i]['watprovID']
         except:
-            i_watProv = extent_gdf.iloc[i]['name']
-        if i_watProv in fatalities_per_watProv.index.values:
+            i_poly = extent_gdf.iloc[i]['name']
+        if i_poly in fatalities_per_poly.index.values:
+            list_out.append(1)
+        else:
+            list_out.append(0)
+
+    if not len(extent_gdf) == len(list_out):
+        raise AssertionError('the dataframe with polygons has a lenght {0} while the lenght of the resulting list is {1}'.format(len(extent_gdf), len(list_out)))
+
+    return list_out
+
+def conflict_in_previous_year(conflict_gdf, extent_gdf, sim_year, t_0_flag=None):
+    """Creates a list for each timestep with boolean information whether a conflict took place in a polygon at the previous timestep or not.
+    If the current time step is the first (t=0), then conflict data of this year is used instead due to the lack of earlier data.
+
+    Args:
+        conflict_gdf (geodataframe): geo-dataframe containing georeferenced information of conflict (tested with PRIO/UCDP data).
+        extent_gdf (geodataframe): geo-dataframe containing one or more polygons with geometry information for which values are extracted.
+        sim_year (int): year for which data is extracted.
+        t_0_flag (bool, optional): Flag whether first time step is run. If so, needs to be set to True. Defaults to None.
+
+    Raises:
+        ValueError: raised if t_0_flag is invalid.
+        AssertionError: raised if the length of output list does not match length of input geo-dataframe.
+
+    Returns:
+        list: list containing 0/1 per polygon depending on conflict occurrence.
+    """    
+
+    # if it is the first time step (t_0), the data of this year will be used
+    if t_0_flag == True:
+        print('... it is the first year, so no conflict for previous year is known')
+        temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year]
+    # else, the data from the previous time step (t-1) is used
+    elif t_0_flag == None:
+        temp_sel_year = conflict_gdf.loc[conflict_gdf.year == sim_year-1]  
+    else:
+        raise ValueError('ERROR: the t_0_flag should either be None or True.') 
+
+    # merge the dataframes with polygons and conflict information, creating a sub-set of polygons/regions
+    data_merged = gpd.sjoin(temp_sel_year, extent_gdf)
+
+    fatalities_per_poly = data_merged['best'].groupby(data_merged['watprovID']).sum().to_frame().rename(columns={"best": 'total_fatalities'})
+
+    # loop through all regions and check if exists in sub-set
+    # if so, this means that there was conflict and thus assign value 1
+    list_out = []
+    for i in range(len(extent_gdf)):
+        i_poly = extent_gdf.iloc[i]['watprovID']
+        if i_poly in fatalities_per_poly.index.values:
             list_out.append(1)
         else:
             list_out.append(0)

diff --git a/copro/data.py b/copro/data.py
@@ -22,7 +22,8 @@ def initiate_XY_data(config):
     XY['poly_geometry'] = pd.Series()
     for key in config.items('data'):
         XY[str(key[0])] = pd.Series(dtype=float)
-    XY['conflict'] = pd.Series(dtype=int)
+    XY['conflict_t-1'] = pd.Series(dtype=bool)
+    XY['conflict'] = pd.Series(dtype=bool)
 
     if config.getboolean('general', 'verbose'): print('{}'.format(XY) + os.linesep)
 
@@ -73,7 +74,8 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
 
     # go through all simulation years as specified in config-file
     model_period = np.arange(config.getint('settings', 'y_start'), config.getint('settings', 'y_end') + 1, 1)
-    for sim_year in model_period:
+
+    for (sim_year, i) in zip(model_period, range(len(model_period))):
 
         print('INFO: entering year {}'.format(sim_year))
 
@@ -83,7 +85,16 @@ def fill_XY(XY, config, root_dir, conflict_gdf, polygon_gdf):
             if key == 'conflict':
 
                 data_series = value
-                data_list = conflict.conflict_in_year_bool(conflict_gdf, polygon_gdf, config, sim_year)
+                data_list = conflict.conflict_in_year_bool(conflict_gdf, polygon_gdf, sim_year)
+                data_series = data_series.append(pd.Series(data_list), ignore_index=True)
+                XY[key] = data_series
+
+            elif key == 'conflict_t-1':
+
+                data_series = value
+                if i==0: t_0_flag = True
+                else: t_0_flag = None
+                data_list = conflict.conflict_in_previous_year(conflict_gdf, polygon_gdf, sim_year, t_0_flag=t_0_flag)
                 data_series = data_series.append(pd.Series(data_list), ignore_index=True)
                 XY[key] = data_series
 
@@ -151,4 +162,29 @@ def split_XY_data(XY, config):
         fraction_Y_1 = 100*len(np.where(Y != 0)[0])/len(Y)
         print('DEBUG: a fraction of {} percent in the data corresponds to conflicts.'.format(round(fraction_Y_1, 2)))
 
-    return X, Y
+    return X, Y
+
+def neighboring_polys(config, extent_gdf, identifier='watprovID'):
+
+    # initialise empty dataframe
+    df = pd.DataFrame()
+
+    # go through each polygon aka water province
+    for i in range(len(extent_gdf)):
+        if config.getboolean('general', 'verbose'): print('DEBUG: finding touching neighbours for identifier {} {}'.format(identifier, extent_gdf[identifier].iloc[i]))
+        # get geometry of current polygon
+        wp = extent_gdf.geometry.iloc[i]
+        # check which polygons in geodataframe (i.e. all water provinces) touch the current polygon
+        # also create a dataframe from result (boolean)
+        # the transpose turns the result into a single row, which is easier to append
+        df_temp = pd.DataFrame(extent_gdf.geometry.touches(wp), columns=[extent_gdf[identifier].iloc[i]]).T
+        # append the dataframe
+        df = df.append(df_temp)
+
+    # replace generic indices with actual water province IDs
+    df.set_index(extent_gdf[identifier], inplace=True)
+
+    # replace generic columns with actual water province IDs
+    df.columns = extent_gdf[identifier].values
+
+    return df
diff --git a/copro/evaluation.py b/copro/evaluation.py
@@ -323,6 +323,7 @@ def get_feature_importance(clf, config, out_dir):
     dict_out = dict()
     for key, x in zip(config.items('data'), range(len(arr))):
         dict_out[key[0]] = arr[x]
+    dict_out['conflict_t-1'] = arr[-1]
 
     df = pd.DataFrame.from_dict(dict_out, orient='index', columns=['feature_importance'])
 

diff --git a/copro/machine_learning.py b/copro/machine_learning.py
@@ -78,14 +78,14 @@ def split_scale_train_test_split(X, Y, config, scaler):
         arrays: arrays containing training-data and test-data as well as IDs and geometry for training-data and test-data.
     """ 
 
-    ##- separate arrays for geomety and variable values
+    ##- separate arrays for ID, geometry, and sample values per polygon
     X_ID, X_geom, X_data = conflict.split_conflict_geom_data(X)
 
     if config.getboolean('general', 'verbose'): print('DEBUG: fitting and transforming X')
     ##- scaling only the variable values
     X_ft = scaler.fit_transform(X_data)
 
-    ##- combining geometry and scaled variable values
+    ##- combining ID, geometry and scaled sample values per polygon
     X_cs = np.column_stack((X_ID, X_geom, X_ft))
 
     if config.getboolean('general', 'verbose'): print('DEBUG: splitting both X and Y in train and test data')

diff --git a/copro/selection.py b/copro/selection.py
@@ -99,14 +99,8 @@ def climate_zoning(gdf, extent_gdf, config, root_dir):
         geo-dataframe: polygons of study area clipped to climate zones.
         dataframe: global look-up dataframe linking polygon ID with geometry information.
     """
-
-    # Koeppen_Geiger_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')),
-    #                                  config.get('climate', 'shp')) 
 
     Koeppen_Geiger_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('climate', 'shp'))
-
-    # code2class_fo = os.path.join(os.path.abspath(config.get('general', 'input_dir')),
-    #                              config.get('climate', 'code2class'))
 
     code2class_fo = os.path.join(root_dir, config.get('general', 'input_dir'), config.get('climate', 'code2class'))
 

diff --git a/docs/_static/roc_curve.png b/docs/_static/roc_curve.png
diff --git a/example/nb01_model_init_and_selection.ipynb b/example/nb01_model_init_and_selection.ipynb
diff --git a/example/nb02_XY_data.ipynb b/example/nb02_XY_data.ipynb
@@ -202,23 +202,170 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "{'poly_ID': Series([], dtype: float64), 'poly_geometry': Series([], dtype: float64), 'total_evaporation': Series([], dtype: float64), 'precipitation': Series([], dtype: float64), 'temperature': Series([], dtype: float64), 'irr_water_demand': Series([], dtype: float64), 'conflict_t-1': Series([], dtype: bool), 'conflict': Series([], dtype: bool)}\n",
+      "\n",
       "INFO: reading data for period from 2000 to 2015\n",
       "INFO: entering year 2000\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "... it is the first year, so no conflict for previous year is known\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2001\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2002\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2003\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2004\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2005\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2006\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2007\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2008\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2009\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2010\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2011\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2012\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2013\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2014\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: entering year 2015\n",
+      "DEBUG: key poly_ID\n",
+      "DEBUG: key poly_geometry\n",
+      "DEBUG: key total_evaporation\n",
+      "DEBUG: key precipitation\n",
+      "DEBUG: key temperature\n",
+      "DEBUG: key irr_water_demand\n",
+      "DEBUG: key conflict_t-1\n",
+      "YOOOOO: now computing conflict for t-1\n",
+      "DEBUG: key conflict\n",
       "INFO: all data read\n",
       "INFO: saving XY data by default to file C:\\Users\\hoch0001\\Documents\\_code\\copro\\example\\./OUT\\XY.npy\n"
      ]
@@ -254,6 +401,13 @@
    "source": [
     "os.path.isfile(os.path.join(os.path.abspath(config.get('general', 'output_dir')), 'XY.npy'))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/example/nb03_model_execution_and_evaluation.ipynb b/example/nb03_model_execution_and_evaluation.ipynb