Merge pull request #97 from JannisHoch/add_paper

Add paper
JannisHoch · Oct 7, 2020 · 9b99760 · 9b99760
2 parents e2eb8a2 + 4f5a5c2
commit 9b99760
Show file tree

Hide file tree

Showing 53 changed files with 578 additions and 2,220 deletions.
diff --git a/README.rst b/README.rst
@@ -2,24 +2,24 @@
 Overview
 ===============
 
-The conflict_model
+CoPro
 ----------------
-(Machine learning) model for mapping environmental drivers of conflict risk.
+A machine-learning tool for conflict risk projections based on climate, environmental, and societal drivers.
 
-.. image:: https://travis-ci.com/JannisHoch/conflict_model.svg?token=BnX1oxxHRbyd1dPyXAp2&branch=dev
-    :target: https://travis-ci.com/JannisHoch/conflict_model
+.. image:: https://travis-ci.com/JannisHoch/copro.svg?token=BnX1oxxHRbyd1dPyXAp2&branch=dev
+    :target: https://travis-ci.com/JannisHoch/copro
 
 .. image:: https://img.shields.io/badge/License-MIT-blue.svg
-    :target: https://github.com/JannisHoch/conflict_model/blob/dev/LICENSE
+    :target: https://github.com/JannisHoch/copro/blob/dev/LICENSE
 
-.. image:: https://readthedocs.org/projects/conflict-model/badge/?version=dev
-    :target: https://conflict-model.readthedocs.io/en/dev/?badge=dev
+.. image:: https://readthedocs.org/projects/copro/badge/?version=dev
+    :target: https://copro.readthedocs.io/en/dev/?badge=dev
+
+.. image:: https://img.shields.io/github/v/release/JannisHoch/copro
+    :target: https://github.com/JannisHoch/copro/releases/tag/v0.0.3
 
-.. image:: https://img.shields.io/github/v/release/JannisHoch/conflict_model
-    :target: https://github.com/JannisHoch/conflict_model/releases/tag/v0.0.5-pre
-
 .. image:: https://zenodo.org/badge/254407279.svg
-   :target: https://zenodo.org/badge/latestdoi/254407279
+    :target: https://zenodo.org/badge/latestdoi/254407279
 
 .. image:: https://badges.frapsoft.com/os/v2/open-source.svg?v=103
     :target: https://github.com/ellerbrock/open-source-badges/
@@ -32,10 +32,10 @@ You can then install the model package into this environment.
 
 .. code-block:: console
 
-    $ git clone https://github.com/JannisHoch/conflict_model.git
-    $ cd path/to/conflict_model
+    $ git clone https://github.com/JannisHoch/copro.git
+    $ cd path/to/copro
     $ conda env create -f environment.yml
-    $ conda activate conflict_model
+    $ conda activate copro
     $ python setup.py develop
 
 Execution
@@ -45,7 +45,7 @@ To be able to run the model, the conda environment has to be activated first.
 
 .. code-block:: console
 
-    $ conda activate conflict_model
+    $ conda activate copro
 
 Example notebook
 ^^^^^^^^^^^^^^^^^^
@@ -55,7 +55,7 @@ They can all be run and converted to htmls by executing the provided shell-scrip
 
 .. code-block:: console
 
-    $ cd path/to/conflict_model/example
+    $ cd path/to/copro/example
     $ sh run.sh
 
 It is of course also possible to execute the notebook cell by cell using jupyter notebook.
@@ -68,7 +68,7 @@ All data and settings are retrieved from the settings-file which needs to be pro
 
 .. code-block:: console
 
-    $ cd path/to/conflict_model/scripts
+    $ cd path/to/copro/scripts
     $ python runner.py ../example/example_settings.cfg
 
 By default, output is stored to the output directory specified in the settings-file. 

diff --git a/conflict_model/__init__.py → copro/__init__.py b/conflict_model/__init__.py → copro/__init__.py
diff --git a/conflict_model/conflict.py → copro/conflict.py b/conflict_model/conflict.py → copro/conflict.py
diff --git a/conflict_model/data.py → copro/data.py b/conflict_model/data.py → copro/data.py
@@ -1,4 +1,4 @@
-from conflict_model import conflict, variables
+from copro import conflict, variables
 import numpy as np
 import xarray as xr
 import pandas as pd

diff --git a/conflict_model/evaluation.py → copro/evaluation.py b/conflict_model/evaluation.py → copro/evaluation.py
@@ -282,14 +282,14 @@ def calc_kFold_polygon_analysis(y_df, global_df, out_dir, k=10):
 
     return gdf
 
-def get_feature_importance(clf, out_dir, config):
+def get_feature_importance(clf, config, out_dir):
     """Determines relative importance of each feature (i.e. variable) used. Must be used after model/classifier is fit.
     Returns dataframe and saves it to csv too.
 
     Args:
         clf (classifier): sklearn-classifier used in the simulation.
-        out_dir (str): path to output folder. If None, output is not saved.
         config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+        out_dir (str): path to output folder. If None, output is not saved.
 
     Returns:
         dataframe: dataframe containing feature importance.
@@ -307,6 +307,7 @@ def get_feature_importance(clf, out_dir, config):
 
     df = pd.DataFrame.from_dict(dict_out, orient='index', columns=['feature_importance'])
 
-    df.to_csv(os.path.join(out_dir, 'feature_importance.csv'))
+    if out_dir != None:
+        df.to_csv(os.path.join(out_dir, 'feature_importance.csv'))
 
     return df
diff --git a/conflict_model/machine_learning.py → copro/machine_learning.py b/conflict_model/machine_learning.py → copro/machine_learning.py
@@ -3,7 +3,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn import svm, neighbors, ensemble, preprocessing, model_selection, metrics
-from conflict_model import conflict
+from copro import conflict
 
 def define_scaling(config):
     """Defines scaling method based on model configurations.

diff --git a/conflict_model/models.py → copro/models.py b/conflict_model/models.py → copro/models.py
@@ -1,4 +1,4 @@
-from conflict_model import machine_learning, conflict, utils, evaluation
+from copro import machine_learning, conflict, utils, evaluation
 import pandas as pd
 import numpy as np
 

diff --git a/conflict_model/pipeline.py → copro/pipeline.py b/conflict_model/pipeline.py → copro/pipeline.py
@@ -1,4 +1,4 @@
-from conflict_model import models, data, machine_learning, evaluation
+from copro import models, data, machine_learning, evaluation
 import pandas as pd
 import numpy as np
 import os, sys

diff --git a/copro/plots.py b/copro/plots.py
@@ -0,0 +1,188 @@
+import matplotlib.pyplot as plt
+import geopandas as gpd
+import seaborn as sbs
+import numpy as np
+import os, sys
+from sklearn import metrics
+from copro import evaluation
+
+def selected_polygons(polygon_gdf, **kwargs):
+    """Creates a plotting instance of the boundaries of all selected polygons.
+
+    Args:
+        polygon_gdf (geo-dataframe): geo-dataframe containing the selected polygons.
+
+    Kwargs:
+        Geopandas-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.   
+    """    
+
+    ax = polygon_gdf.boundary.plot(**kwargs)
+
+    return ax
+
+def selected_conflicts(conflict_gdf, **kwargs):
+    """Creates a plotting instance of the best casualty estimates of the selected conflicts.
+
+    Args:
+        conflict_gdf (geo-dataframe): geo-dataframe containing the selected conflicts.
+
+    Kwargs:
+        Geopandas-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.   
+    """       
+
+    ax = conflict_gdf.plot(column='best', **kwargs)
+
+    return ax
+
+def metrics_distribution(out_dict, **kwargs):
+    """Plots the value distribution of a range of evaluation metrics based on all model simulations.
+
+    Args:
+        out_dict (dict): dictionary containing the scores of various evaluation metrics for all simulations.
+
+    Kwargs:
+        Matplotlib-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.    
+    """    
+
+    fig, ax = plt.subplots(1, 1, **kwargs)
+
+    sbs.distplot(out_dict['Accuracy'], ax=ax, color="k", label='Accuracy')
+    sbs.distplot(out_dict['Precision'], ax=ax, color="r", label='Precision')
+    sbs.distplot(out_dict['Recall'], ax=ax, color="b", label='Recall')
+    plt.legend()
+
+    return ax
+
+def correlation_matrix(df, **kwargs):
+    """Plots the correlation matrix of a dataframe.
+
+    Args:
+        df (dataframe): dataframe containing columns to be correlated.
+
+    Kwargs:
+        Seaborn-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.    
+    """    
+
+    df_corr = evaluation.calc_correlation_matrix(df)
+
+    ax = sbs.heatmap(df_corr, **kwargs)
+
+    return ax
+
+def polygon_categorization(gdf, category='sub', method='median', **kwargs):
+    """Plots the categorization of polygons based on chance of correct prediction and number of conflicts.
+
+    Main categories are:
+        * H: chance of correct prediction higher than threshold;
+        * L: chance of correct prediction lower than threshold.
+
+    Sub-categories are:
+        * HH: high chance of correct prediction with high number of conflicts;
+        * HL: high chance of correct prediction with low number of conflicts;
+        * LH: low chance of correct prediction with high number of conflicts;
+        * LL: low chance of correct prediction with low number of conflicts.
+
+    Args:
+        gdf (geo-dataframe): containing model evaluation per unique polygon.
+        category (str, optional): level of categorization, passed on to evaluation.categorize_polys. Defaults to 'sub'.
+        method (str, optional): Statistical method used to determine categorization threshold. Defaults to 'median'.
+
+    Kwargs:
+        Geopandas-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.        
+    """    
+
+    gdf = evaluation.categorize_polys(gdf, category, method)
+
+    ax = gdf.plot(column='category', **kwargs)
+
+    return ax
+
+def plot_ROC_curve_n_times(ax, clf, X_test, y_test, tprs, aucs, mean_fpr, **kwargs):
+    """Plots the ROC-curve per model simulation to a pre-initiated matplotlib-instance.
+
+    Args:
+        ax (axis): axis of pre-initiated matplotlib-instance
+        clf (classifier): sklearn-classifier used in the simulation.
+        X_test (array): array containing test-sample variable values.
+        y_test (list): list containing test-sample conflict data.
+        tprs (list): list with true positive rates.
+        aucs (list): list with area-under-curve values.
+        mean_fpr (array): array with mean false positive rate.
+
+    Returns:
+        list: lists with true positive rates and area-under-curve values per plot.
+    """    
+
+    viz = metrics.plot_roc_curve(clf, X_test, y_test, ax=ax,
+                            	 alpha=0.15, color='b', lw=1, label=None, **kwargs)
+
+    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
+    interp_tpr[0] = 0.0
+    tprs.append(interp_tpr)
+    aucs.append(viz.roc_auc)
+
+    return tprs, aucs
+
+def plot_ROC_curve_n_mean(ax, tprs, aucs, mean_fpr, **kwargs):
+    """Plots the mean ROC-curve to a pre-initiated matplotlib-instance.
+
+    Args:
+        ax (axis): axis of pre-initiated matplotlib-instance
+        tprs (list): list with true positive rates.
+        aucs (list): list with area-under-curve values.
+        mean_fpr (array): array with mean false positive rate.
+    """    
+
+    mean_tpr = np.mean(tprs, axis=0)
+    mean_tpr[-1] = 1.0
+    mean_auc = metrics.auc(mean_fpr, mean_tpr)
+    std_auc = np.std(aucs)
+    ax.plot(mean_fpr, mean_tpr, color='r',
+            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
+            lw=2, alpha=.8, **kwargs)
+
+    std_tpr = np.std(tprs, axis=0)
+    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
+    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
+    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=None, **kwargs)
+
+    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], **kwargs)
+
+    ax.legend(loc="lower right")
+
+def factor_importance(clf, config, out_dir=None, **kwargs):
+    """Plots the relative importance of each factor as bar plot. Note, this works only for RFClassifier as ML-model!
+
+    Args:
+        clf (classifier): sklearn-classifier used in the simulation.
+        config (ConfigParser-object): object containing the parsed configuration-settings of the model.
+        out_dir (str): path to output folder. If None, output is not saved.
+
+    Kwargs:
+        Matplotlib-supported keyword arguments.
+
+    Returns:
+        ax: Matplotlib axis object.
+    """    
+
+    df = evaluation.get_feature_importance(clf, config, out_dir)
+
+    ax = df.plot.bar(**kwargs)
+
+    return ax
+
diff --git a/conflict_model/selection.py → copro/selection.py b/conflict_model/selection.py → copro/selection.py
@@ -2,7 +2,7 @@
 import geopandas as gpd
 import numpy as np
 import os, sys
-from conflict_model import utils
+from copro import utils
 
 def filter_conflict_properties(gdf, config):
     """Filters conflict database according to certain conflict properties such as number of casualties, type of violence or country.

diff --git a/conflict_model/utils.py → copro/utils.py b/conflict_model/utils.py → copro/utils.py
@@ -40,7 +40,7 @@ def show_versions():
     """Prints the version numbers by the main python-packages used.
     """ 
 
-    from conflict_model import __version__ as cm_version
+    from copro import __version__ as cm_version
     from geopandas import __version__ as gpd_version
     from pandas import __version__ as pd_version
     from numpy import __version__ as np_version
@@ -57,7 +57,7 @@ def show_versions():
         sys.exit('please upgrade geopandas to version 0.7.0, your current version is {}'.format(gpd_version))
 
     print("Python version: {}".format(os_version))
-    print("conflict_model version: {}".format(cm_version))
+    print("copro version: {}".format(cm_version))
     print("geopandas version: {}".format(gpd_version))
     print("xarray version: {}".format(xr_version))
     print("rasterio version: {}".format(rio_version))

diff --git a/conflict_model/variables.py → copro/variables.py b/conflict_model/variables.py → copro/variables.py
diff --git a/docs/Execution.rst b/docs/Execution.rst
@@ -5,7 +5,7 @@ To be able to run the model, the conda environment has to be activated first.
 
 .. code-block:: console
 
-    $ conda activate conflict_model
+    $ conda activate copro
 
 Example notebook
 -----------------
@@ -15,7 +15,7 @@ They can all be run and converted to htmls by executing the provided shell-scrip
 
 .. code-block:: console
 
-    $ cd path/to/conflict_model/example
+    $ cd path/to/copro/example
     $ sh run.sh
 
 It is of course also possible to execute the notebook cell by cell using jupyter notebook.
@@ -28,7 +28,7 @@ All data and settings are retrieved from the settings-file which needs to be pro
 
 .. code-block:: console
 
-    $ cd path/to/conflict_model/scripts
+    $ cd path/to/copro/scripts
     $ python runner.py ../example/example_settings.cfg
 
 By default, output is stored to the output directory specified in the settings-file. 
diff --git a/docs/Installation.rst b/docs/Installation.rst
@@ -9,10 +9,10 @@ You can then install the model package into this environment.
 
 .. code-block:: console
 
-    $ git clone https://github.com/JannisHoch/conflict_model.git
-    $ cd path/to/conflict_model
+    $ git clone https://github.com/JannisHoch/copro.git
+    $ cd path/to/copro
     $ conda env create -f environment.yml
-    $ conda activate conflict_model
+    $ conda activate copro
     $ python setup.py develop
 
 From PyPI

diff --git a/docs/api/XYdata.rst b/docs/api/XYdata.rst
@@ -1,7 +1,7 @@
 XY-Data
 =================================
 
-.. currentmodule:: conflict_model
+.. currentmodule:: copro
 
 .. autosummary::
    :toctree: generated/