Skip to content

Commit

Permalink
Merge pull request #139 from JannisHoch/update_docs
Browse files Browse the repository at this point in the history
Update docs
  • Loading branch information
JannisHoch committed Jun 4, 2021
2 parents 55df87f + ccfd370 commit 4e2d3fe
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 502 deletions.
2 changes: 1 addition & 1 deletion copro/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@

__author__ = """Jannis M. Hoch, Sophie de Bruin, Niko Wanders"""
__email__ = 'j.m.hoch@uu.nl'
__version__ = '0.0.8b'
__version__ = '0.0.8'
14 changes: 8 additions & 6 deletions copro/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sbs
import seaborn as sns
sns.set_palette('colorblind')
import numpy as np
import os, sys
from sklearn import metrics
Expand Down Expand Up @@ -42,7 +43,7 @@ def selected_conflicts(conflict_gdf, **kwargs):

return ax

def metrics_distribution(out_dict, **kwargs):
def metrics_distribution(out_dict, metrics, **kwargs):
"""Plots the value distribution of a range of evaluation metrics based on all model simulations.
Args:
Expand All @@ -57,9 +58,10 @@ def metrics_distribution(out_dict, **kwargs):

fig, ax = plt.subplots(1, 1, **kwargs)

sbs.histplot(out_dict['Accuracy'], ax=ax, color="k", label='Accuracy')
sbs.histplot(out_dict['Precision'], ax=ax, color="r", label='Precision')
sbs.histplot(out_dict['Recall'], ax=ax, color="b", label='Recall')
for metric, color in zip(metrics, sns.color_palette('colorblind')):

sns.histplot(out_dict[str(metric)], ax=ax, kde=True, stat='density', color=color, label=str(metric))

plt.legend()

return ax
Expand All @@ -79,7 +81,7 @@ def correlation_matrix(df, **kwargs):

df_corr = evaluation.calc_correlation_matrix(df)

ax = sbs.heatmap(df_corr, **kwargs)
ax = sns.heatmap(df_corr, **kwargs)

return ax

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import os

@click.command()
@click.option('-id', '--polygon-id', multiple=True, type=int)
@click.option('-id', '--polygon-id', multiple=True)
@click.option('-c', '--column', help='column name', default='chance_of_conflict', type=str)
@click.option('-t', '--title', help='title for plot and file_object name', type=str)
@click.option('--verbose/--no-verbose', help='verbose on/off', default=False)
Expand All @@ -18,50 +18,87 @@ def main(input_dir=None, polygon_id=None, column=None, title=None, output_dir=No
"""Quick and dirty function to plot the development of a column in the outputted geojson-files over time.
"""

assert(len(polygon_id) > 0), AssertionError('please specify one polygon ID to be sampled')
assert(len(polygon_id) > 0), AssertionError('please specify at least one polygon ID to be sampled or select ''all'' for sampling the entire study area')

# absolute path to input_dir
input_dir = os.path.abspath(input_dir)
click.echo('\ngetting geojson-files from {}'.format(input_dir))

# collect all files in input_dir
all_files = glob.glob(os.path.join(input_dir, '*.geojson'))

if verbose:
if polygon_id != 'all':
click.echo('sampling from IDs'.format(polygon_id))
else:
click.echo('sampling over entire study area')

# create dictionary with list for areas (either IDs or entire study area) to be sampled from
out_dict = dict()
for idx in polygon_id:
out_dict[idx] = list()
if polygon_id != 'all':
out_dict[int(idx)] = list()
else:
out_dict[idx] = list()

# create a list to keep track of year-values in files
years = list()

print('retrieving values from column {}'.format(column))
# go through all files
click.echo('retrieving values from column {}'.format(column))
for geojson in all_files:
if verbose: print('reading file {}'.format(geojson))

if verbose: click.echo('reading file {}'.format(geojson))
# read file and convert to geo-dataframe
gdf = gpd.read_file(geojson, driver='GeoJSON')
# convert geo-dataframe to dataframe
df = pd.DataFrame(gdf.drop(columns='geometry'))

# get year-value
year = int(str(str(os.path.basename(geojson)).rsplit('.')[0]).rsplit('_')[-1])
years.append(year)

for idx in polygon_id:
if verbose: print('sampling ID {}'.format(idx))

if idx not in df.ID.values:
print('WARNING: ID {} is not in {} - NaN set'.format(idx, geojson))
vals = np.nan
else:
vals = df[column].loc[df.ID==idx].values[0]

if polygon_id != 'all':
# go through all IDs
for idx in polygon_id:
if verbose:
if polygon_id != 'all':
print('sampling ID {}'.format(idx))

# if ID not in file, assign NaN
if idx not in df.ID.values:
print('WARNING: ID {} is not in {} - NaN set'.format(idx, geojson))
vals = np.nan
# otherwise, get value of column at this ID
else:
vals = df[column].loc[df.ID==idx].values[0]

# append this value to list in dict
idx_list = out_dict[idx]
idx_list.append(vals)

else:
# compute mean value over column
vals = df[column].mean()
# append this value to list in dict
idx_list = out_dict[idx]
idx_list.append(vals)


# create a dataframe from dict and assign year-values as index
df = pd.DataFrame().from_dict(out_dict)
years = pd.to_datetime(years, format='%Y')
df.index = years

# create an output folder, if not yet there
if not os.path.isdir(os.path.abspath(output_dir)):
click.echo('creating output folder {}'.format(os.path.abspath(output_dir)))
os.makedirs(os.path.abspath(output_dir))

# save dataframe as csv-file
df.to_csv(os.path.abspath(os.path.join(output_dir, '{}_dev.csv'.format(column))))

# create a simple plot and save to file
fig, axes = plt.subplots(nrows=len(polygon_id), ncols=1, sharex=True)
df.plot(subplots=True, ax=axes)
for ax in axes:
Expand Down
Binary file modified docs/_static/roc_curve.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 0 additions & 6 deletions example/_scripts/run_notebooks.sh

This file was deleted.

86 changes: 40 additions & 46 deletions example/nb01_model_init_and_selection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"source": [
"# Model initialization and selection procedure\n",
"\n",
"In this notebook, we will show how CoPro is initialized and how the polygons and conflicts are selected."
"In this notebook, we will show how CoPro is initialized and how the selection procedure of spatial aggregation units and conflicts works."
]
},
{
Expand Down Expand Up @@ -40,7 +40,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"For better reproducibility, the version numbers of all key packages are provided."
"For better reproducibility, the version numbers of all key packages used to run this notebook are provided."
]
},
{
Expand All @@ -53,7 +53,7 @@
"output_type": "stream",
"text": [
"Python version: 3.7.8 | packaged by conda-forge | (default, Jul 31 2020, 01:53:57) [MSC v.1916 64 bit (AMD64)]\n",
"copro version: 0.0.8b\n",
"copro version: 0.0.8\n",
"geopandas version: 0.9.0\n",
"xarray version: 0.15.1\n",
"rasterio version: 1.1.0\n",
Expand All @@ -74,14 +74,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### The configurations-file (cfg-file)"
"## The configurations-file (cfg-file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the cfg-file, all the settings for the analysis are defined. Note that the cfg-file can be stored anywhere, not per se in the same directory where the model data is stored (as in this example case). Make sure that the paths in the cfg-file are updated if you use relative paths and change the folder location of th cfg-file."
"In the configurations-file (cfg-file), all the settings for the analysis are defined. The cfg-file contains, amongst others, all paths to input files, settings for the machine-learning model, and the various selection criteria for spatial aggregation units and conflicts. Note that the cfg-file can be stored anywhere, not necessarily in the same directory where the model data is stored (as in this example case). Make sure that the paths in the cfg-file are updated if you use relative paths and change the folder location of the cfg-file!"
]
},
{
Expand All @@ -97,7 +97,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Based on this cfg-file, the set-up of the run can be initialized. Here, the cfg-file is parsed (i.e. read) and all settings and paths become known to the model. Also, the output folder is created (if it does not exist yet) and the cfg-file is copied to the output folder for improved reusability."
"Based on this cfg-file, the set-up of the run can be initialized. Here, the cfg-file is parsed (i.e. read) and all settings and paths become 'known' to the model. Also, the output folder is created (if it does not exist yet) and the cfg-file is copied to the output folder for improved reusability.\n",
"\n",
"If you set `verbose=True`, then additional statements are printed during model execution. This can help to track the behaviour of the model."
]
},
{
Expand All @@ -110,7 +112,7 @@
"output_type": "stream",
"text": [
"\n",
"#### CoPro version 0.0.8b ####\n",
"#### CoPro version 0.0.8 ####\n",
"#### For information about the model, please visit https://copro.readthedocs.io/ ####\n",
"#### Copyright (2020-2021): Jannis M. Hoch, Sophie de Bruin, Niko Wanders ####\n",
"#### Contact via: j.m.hoch@uu.nl ####\n",
Expand All @@ -130,7 +132,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"One of the outputs is a dictionary containing the parsed configurations as well as output directories of both the reference run and the various projection runs:"
"One of the outputs is a dictionary (here `main_dict`) containing the parsed configurations (they are stored in computer memory, therefore the slighly odd specification) as well as output directories of both the reference run and the various projection runs specified in the cfg-file. \n",
"\n",
"For the reference run, only the respective entries are required."
]
},
{
Expand All @@ -139,65 +143,46 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'_REF': [<configparser.RawConfigParser at 0x1b251bac548>,\n",
" 'C:\\\\Users\\\\hoch0001\\\\Documents\\\\_code\\\\copro\\\\example\\\\./OUT\\\\_REF'],\n",
" 'proj_nr_1': [[<configparser.RawConfigParser at 0x1b251f8ce88>],\n",
" 'C:\\\\Users\\\\hoch0001\\\\Documents\\\\_code\\\\copro\\\\example\\\\./OUT\\\\_PROJ\\\\proj_nr_1']}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"the configuration of the reference run is <configparser.RawConfigParser object at 0x000002AE32A09E88>\n",
"the output directory of the reference run is C:\\Users\\hoch0001\\Documents\\_code\\copro\\example\\./OUT\\_REF\n"
]
}
],
"source": [
"main_dict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the reference run, only the respective entries are required."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"config_REF = main_dict['_REF'][0]\n",
"out_dir_REF = main_dict['_REF'][1] "
"print('the configuration of the reference run is {}'.format(config_REF))\n",
"out_dir_REF = main_dict['_REF'][1] \n",
"print('the output directory of the reference run is {}'.format(out_dir_REF))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Filter conflicts and polygons\n",
"## Filter conflicts and spatial aggregation units\n",
"\n",
"### Background\n",
"\n",
"As conflict database, we use the [UCDP Georeferenced Event Dataset](https://ucdp.uu.se/downloads/index.html#ged_global) v201. Not all conflicts of the database may always need to be used for a simulation. This can be, for example, because they belong to a non-relevant type of conflict we are not interested in, or because it is simply not in our area-of-interest. Therefore, it is possible to filter the conflicts on various properties:\n",
"As conflict database, we use the [UCDP Georeferenced Event Dataset](https://ucdp.uu.se/downloads/index.html#ged_global). Not all conflicts of the database may need to be used for a simulation. This can be, for example, because they belong to a non-relevant type of conflict we are not interested in, or because it is simply not in our area-of-interest. Therefore, it is possible to filter the conflicts on various properties:\n",
"\n",
"1. min_nr_casualties: minimum number of casualties of a reported conflict; \n",
"1. type_of_violence: 1=state-based armed conflict; 2=non-state conflict; 3=one-sided violence.\n",
"1. *min_nr_casualties: minimum number of casualties of a reported conflict;* \n",
"1. *type_of_violence: 1=state-based armed conflict; 2=non-state conflict; 3=one-sided violence.*\n",
"\n",
"To unravel the interplay between climate and conflict, it may be beneficial to run the model only for conflicts in particular climate zones. It is hence also possible to select only those conflicts that fall within a climate zone following the [Koeppen-Geiger classification](http://koeppen-geiger.vu-wien.ac.at/).\n",
"\n",
"### Selection procedure\n",
"\n",
"In the selection procedure, we first load the conflict database and convert it to a georeferenced dataframe (geo-dataframe). To define the study area, a shape-file containing polygons (in this case water provinces) is loaded and converted to geo-dataframe as well.\n",
"\n",
"We then apply the selection criteria (see above) as specified in the cfg-file, and keep the remaining data points and polygons. "
"We then apply the selection criteria (see above) as specified in the cfg-file, and keep the remaining data points and associated polygons. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -222,7 +207,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -251,12 +236,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"It's nicely visible that for this example-run, not all provinces are considered but we focus on specified climate zones only."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Temporary files\n",
"\n",
"To be able to also run the following notebooks, some of the data has to be written to file temporarily. This is **not** part of the CoPro workflow but merely needed to split up the workflow in different notebooks outlining the main steps to go through when using CoPro."
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -266,7 +260,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -276,7 +270,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down

0 comments on commit 4e2d3fe

Please sign in to comment.