In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import shapely.geometry
from shapely.geometry import Point
from mpl_toolkits.basemap import Basemap
import glob
import rasterio
import os
import shutil
import pyimpute
from pathlib import Path

In [3]:
# Glob pattern selecting the GeoTIFF rasters that are used as feature layers.
# NOTE(review): the directory name suggests WorldClim 2.1 bioclimatic layers at
# 30 arc-second resolution - confirm against the data source.
BIO_PATH = "assets/wc2.1_30s_bio/*.tif"

In [None]:
# Load the presence/absence table. Later cells read its 'decimalLongitude' and
# 'decimalLatitude' columns to sample the rasters at each record's location.
presence_absence = pd.read_parquet("model/presence_absence.parquet")

In [4]:
# Resolve the raster pattern to a sorted list of file paths so the feature
# layers are always processed in the same order.
raster_features = sorted(glob.glob(BIO_PATH))

# Fail here, with the offending pattern in the message, instead of letting an
# empty list silently produce a training table with no feature columns.
if not raster_features:
    raise FileNotFoundError("No raster files match pattern: " + BIO_PATH)

In [None]:
# (longitude, latitude) pairs, one per record, in table order.
coord_list = list(
    zip(presence_absence['decimalLongitude'], presence_absence['decimalLatitude'])
)

In [None]:
# Point sampling: add one float64 column per raster, named after the file stem,
# holding the raster value at each record's coordinate.
for raster_path in raster_features:
    layer_name = Path(raster_path).stem
    # The context manager closes the dataset even if sampling fails, so file
    # handles are not leaked across the loop.
    with rasterio.open(raster_path) as src:
        # Each sampled item is an array of band values for one coordinate; keep
        # the first band as a scalar instead of relying on array-to-float casts.
        # NOTE(review): assumes single-band rasters - confirm.
        sampled_values = [band_values[0] for band_values in src.sample(coord_list)]
    presence_absence[layer_name] = np.asarray(sampled_values, dtype='float64')

In [None]:
# Display the table after point sampling to inspect the newly added raster columns.
presence_absence

In [None]:
# Training table: every column except the two coordinate columns.
# NOTE(review): Index.difference returns the remaining labels sorted by default,
# so column order here is presumably alphabetical rather than the parquet's
# order. Later cells treat column 0 as the response by position - confirm the
# label column really sorts first.
train_vec = presence_absence.loc[
    :, presence_absence.columns.difference(['decimalLatitude', 'decimalLongitude'])
]

In [None]:
# Display the training table (coordinates removed) to check its column layout.
train_vec

In [None]:
# Names of every column after the first; the first column is used as the
# response variable when the arrays are built further down.
columns = train_vec.columns[1:].tolist()

In [None]:
# Rows whose feature columns all hold one single distinct value.
# NOTE(review): presumably these are points that fell on raster nodata cells -
# confirm that a constant row cannot occur for valid data.
nodata_df = train_vec.loc[train_vec[columns].nunique(axis=1).eq(1)]
nodata_df.head()

In [None]:
# Write the training table without the flagged rows to disk, then rebind
# train_vec to the reloaded file so memory and disk hold the same data.
# NOTE(review): the CSV is written without its index, so after this cell runs
# train_vec carries a fresh index while nodata_df.index still holds the old
# labels - re-running this cell alone would not drop the same rows.
train_vec.drop(index=nodata_df.index).to_csv('TRAIN_VEC.csv', index=False)
train_vec = pd.read_csv("TRAIN_VEC.csv")
train_vec.head()

In [None]:
# Split the training table by position: column 0 is the response variable,
# all remaining columns are the explanatory variables.
train_y = train_vec.iloc[:, 0].values
train_xs = train_vec.iloc[:, 1:].values

# Show both shapes as a sanity check.
(train_xs.shape, train_y.shape)

In [5]:
# target data: explanatory variables
# target_xs, raster_info = pyimpute.load_targets(raster_features[:3])

In [None]:
# ML 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn import model_selection 
from sklearn import metrics 

In [None]:
# Shared seed so the randomized estimators give repeatable results across runs.
RANDOM_STATE = 42

# ML classifier dictionary: short name -> unfitted scikit-learn estimator.
# The 'Maxent' entry is a plain LogisticRegression model.
CLASS_MAP = {
    'RF': RandomForestClassifier(random_state=RANDOM_STATE),
    'ET': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'ADA': AdaBoostClassifier(random_state=RANDOM_STATE),
    'BAG': BaggingClassifier(random_state=RANDOM_STATE),
    'GRA': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Maxent': LogisticRegression(random_state=RANDOM_STATE),
}

In [None]:
# ROC
def plot_roc_curve(fper, tper):
    """Draw a ROC curve on the current axes and show the figure.

    Parameters
    ----------
    fper : sequence of float
        False-positive rates, plotted on the x axis.
    tper : sequence of float
        True-positive rates, plotted on the y axis.
    """
    axes = plt.gca()
    axes.plot(fper, tper, color='red', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    axes.plot([0, 1], [0, 1], color='green', linestyle='--')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic Curve')
    axes.legend()
    plt.show()

In [None]:
def evaluate_clf(
    clf, X, y, name, k=None, test_size=0.2, scoring="f1_weighted", feature_names=None,
    random_state=None
):
    """Fit a classifier on a stratified train split and print hold-out metrics.

    Prints, in order: an optional k-fold cross-validation summary on the
    training split, accuracy, the classification report, the confusion matrix
    and the ROC AUC; then plots the ROC curve and, when the fitted model
    exposes ``feature_importances_``, lists them.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn style classifier; it is fitted in place.
    X, y : array-like
        Explanatory variables and response variable.
        NOTE(review): the ROC section uses the score of the second class, so
        this assumes a binary response - confirm.
    name : str
        Label prefixed to the cross-validation line.
    k : int or None
        Number of cross-validation folds; falsy skips cross-validation.
    test_size : float
        Fraction of the data held out for testing.
    scoring : str
        Metric passed to ``cross_val_score``.
    feature_names : sequence of str or None
        Labels for the feature importances; generic names are used when None.
    random_state : int or None
        Seed for the train/test split; None keeps the split unseeded.

    Returns
    -------
    None
    """
    X_train, X_test, y_train, y_true = model_selection.train_test_split(
        X, y,
        test_size=test_size, # Test data size
        shuffle=True, # Shuffle the data before split
        stratify=y, # Keeping the appearance/non-appearance ratio of Y
        random_state=random_state # Optional seed for a repeatable split
    )

    if k: # Cross-validation
        kf = model_selection.KFold(n_splits=k) # k-fold
        scores = model_selection.cross_val_score(clf, X_train, y_train, cv=kf, scoring=scoring)
        # Label the line with the metric actually computed rather than a
        # hard-coded "Accuracy".
        print(name + " %d-fold Cross Validation %s: %0.2f (+/- %0.2f)"
              % (k, scoring, scores.mean() * 100, scores.std() * 200))
        print()

    clf.fit(X_train, y_train) # Training of classifiers
    y_pred = clf.predict(X_test) # Classifier predictions

    # Continuous scores for the ROC analysis: hard labels would collapse the
    # curve to a single operating point and misstate the AUC.
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test)[:, 1]
    else:
        y_score = clf.decision_function(X_test)

    # Classifier evaluation metrics
    print("Accuracy Score: %.2f" % metrics.accuracy_score(y_true, y_pred))
    print()

    print("Classification report")
    print(metrics.classification_report(y_true, y_pred))
    print()

    print("Confussion matrix")
    print(metrics.confusion_matrix(y_true, y_pred))
    print()

    print('AUC(ROC): %.2f'% metrics.roc_auc_score(y_true, y_score))
    print()

    # ROC
    fper, tper, _ = metrics.roc_curve(y_true, y_score)
    plot_roc_curve(fper, tper)

    if hasattr(clf, "feature_importances_"):
        importances = clf.feature_importances_
        # The default feature_names=None used to crash inside zip(); fall back
        # to positional names instead.
        if feature_names is None:
            feature_names = ["feature_%d" % i for i in range(len(importances))]
        print("Feature importances")
        for f, imp in zip(feature_names, importances):
            print("%20s: %s" % (f, round(imp * 100, 1)))
        print()

In [None]:
# Spatial prediction
# Evaluate every classifier and make sure its output directory exists.
for name, model in CLASS_MAP.items():
    evaluate_clf(model, train_xs, train_y, name, k=5, test_size=0.2, scoring="f1_weighted", feature_names=columns)
    # makedirs also creates the 'OUTPUT' parent and tolerates an existing
    # directory, while still surfacing real failures such as permission errors
    # that the previous bare except silently discarded.
    os.makedirs('OUTPUT/' + name + '-IMAGES', exist_ok=True)
    #pyimpute.impute(target_xs, model, raster_info, outdir='OUTPUT/' + name + '-IMAGES', class_prob=True, certainty=True)