In [1]:
from argparse import ArgumentParser, Namespace
import h5py
from itertools import permutations
from pathlib import Path
from typing import cast, Optional, List, Tuple, Dict, Type, TypeVar, Sequence
from tqdm import tqdm
import sys

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from shapely.geometry import Point
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task
from cropharvest.columns import NullableColumns, RequiredColumns
from cropharvest.config import FEATURES_DIR
from cropharvest.engineer import Engineer
from cropharvest.utils import load_normalizing_dict
from cropharvest.bands import BANDS, DYNAMIC_BANDS, STATIC_BANDS, REMOVED_BANDS

# Add the parent directory to the import path so the project-local `src` package
# imported just below can be resolved when the notebook runs from its own folder.
sys.path.append("..")

from src.models import STR2MODEL, STR2BASE, train_model
from src.models.data import LandTypeClassificationDataset, NigeriaCropHarvestDataset, GeowikiCropHarvestDataset

# Band names, presumably the Sentinel-2 bands (plus NDVI) kept by the "S2 only" experiments — TODO confirm.
# NOTE(review): no cell visible in this notebook reads S2_BANDS; the S2 selection is requested
# through the `S2_features_only` flag passed to `as_array` instead.
S2_BANDS = ['B2','B3','B4','B5','B6','B7','B8','B8A','B9','B11','B12','NDVI']

In [2]:
# Root of the data folder, given relative to the notebook's working directory.
DATA_DIR = Path("../data")

In [3]:
# Helper functions
def get_metrics(y_true, y_probs, threshold: float = 0.5):
    """Compute, print and return binary-classification metrics.

    Args:
        y_true: Ground-truth binary labels.
        y_probs: Predicted scores for the positive class. Any array-like is
            accepted (it is converted with ``np.asarray``), so plain Python
            lists work as well as numpy arrays.
        threshold: Scores strictly greater than this value are turned into a
            prediction of class 1. Defaults to 0.5, the value that used to be
            hard-coded.

    Returns:
        A dict with the keys ``roc_auc``, ``precision``, ``recall``, ``f1`` and
        ``acc``.
    """
    # Coerce once so that the comparison + ``astype`` below also work for lists.
    y_probs = np.asarray(y_probs)

    # ROC-AUC is computed on the raw scores; every other metric on hard labels.
    roc_auc = roc_auc_score(y_true, y_probs)
    y_pred = (y_probs > threshold).astype(int)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    # The labels always say "test set" because the notebook only evaluates on the test split.
    print('RF roc-auc test set:', round(roc_auc, 3))
    print('RF precision test set:', round(precision, 3))
    print('RF recall test set:', round(recall, 3))
    print('RF f1-score test set:', round(f1, 3))
    print('RF accuracy test set:', round(acc, 3))

    return {'roc_auc': roc_auc, 'precision': precision, 'recall': recall, 'f1': f1, 'acc': acc}

def get_model(add_geowiki: bool, add_nigeria: bool, geowiki_subset: str):
    """Instantiate the "land_cover" model with its default arguments plus dataset overrides.

    The defaults come from parsing an empty argument list with a parser that
    holds the generic training options and the model-specific ones. The three
    dataset-related arguments passed to this function are then written on top
    of those defaults.

    Args:
        add_geowiki: Stored as the ``add_geowiki`` model argument.
        add_nigeria: Stored as the ``add_nigeria`` model argument.
        geowiki_subset: Stored as the ``geowiki_subset`` model argument
            (e.g. 'nigeria', 'neighbours1').

    Returns:
        The object built by ``STR2MODEL["land_cover"]`` from the resulting arguments.
    """
    model_cls = STR2MODEL["land_cover"]

    # Generic training options; only their defaults are used since no CLI args are parsed.
    base_parser = ArgumentParser()
    base_parser.add_argument("--max_epochs", type=int, default=100)
    base_parser.add_argument("--patience", type=int, default=10)
    base_parser.add_argument("--gpus", type=int, default=0)
    base_parser.add_argument("--wandb", default=False, action="store_true")

    full_parser = model_cls.add_model_specific_args(base_parser)
    default_args = vars(full_parser.parse_args(args=[]))

    # Modifications to the default model arguments.
    overrides = {
        'add_geowiki': add_geowiki,
        'add_nigeria': add_nigeria,
        'geowiki_subset': geowiki_subset,
    }
    hparams = Namespace(**{**default_args, **overrides})

    return model_cls(hparams)

def get_dataset_splits(add_geowiki: bool, add_nigeria: bool, geowiki_subset: str='world', S2_features_only: bool=False):
    """Return the training, validation and testing splits as flattened arrays.

    This is a workaround: the land-cover model class (which, per the original
    note, only supports the LSTM model) is instantiated purely to reach its
    dataset splits.

    Args:
        add_geowiki: Forwarded to ``get_model``.
        add_nigeria: Forwarded to ``get_model``.
        geowiki_subset: Forwarded to ``get_model``.
        S2_features_only: Forwarded to ``as_array`` for every split.

    Returns:
        A 3-tuple ``(train, val, test)``; each element is whatever ``as_array``
        returns (callers in this notebook unpack it as ``(X, y)``).
    """
    land_cover_mapper = get_model(add_geowiki, add_nigeria, geowiki_subset)
    array_kwargs = dict(flatten_x=True, S2_features_only=S2_features_only)

    # The training split is requested first and without an explicit normalizing dict.
    splits = [land_cover_mapper.get_dataset(subset="training").as_array(**array_kwargs)]

    # Validation and testing reuse the mapper's normalizing dict (read after the training split is built).
    # During development the validation split was also used for evaluation by passing
    # subset="validation" together with evaluating=True instead of subset="testing".
    for held_out_subset in ("validation", "testing"):
        held_out = land_cover_mapper.get_dataset(
            subset=held_out_subset,
            normalizing_dict=land_cover_mapper.normalizing_dict,
        )
        splits.append(held_out.as_array(**array_kwargs))

    return tuple(splits)

In [5]:
# NOTE(review): `get_dataset` below reads the globals `geowiki_train` and `geowiki_val`.
# The only code that would define them is the commented-out block right here, so these lines
# must be re-enabled (or the globals defined some other way) before calling
# `get_dataset` with add_geowiki=True; otherwise it raises NameError.
# countries_subset = ['Nigeria']
# geowiki_dataset = GeowikiCropHarvestDataset(root=DATA_DIR/"cropharvest", countries_subset=countries_subset)
# geowiki_train, geowiki_val = geowiki_dataset.train_val_split(geowiki_dataset)

def get_dataset(subset: str, normalizing_dict: Optional[Dict] = None, evaluating: bool = False, add_geowiki: bool=True, add_nigeria: bool=True) -> LandTypeClassificationDataset:       
    """Build a LandTypeClassificationDataset for one split.

    Args:
        subset: Split name; the code distinguishes 'training', 'validation' and 'testing'.
        normalizing_dict: Passed through unchanged to LandTypeClassificationDataset.
        evaluating: Passed through; together with subset == "validation" it also
            forces the Nigeria dataset to be created.
        add_geowiki: When True, use the global Geowiki split matching `subset`.
        add_nigeria: When True, always create the Nigeria dataset.

    Returns:
        The constructed LandTypeClassificationDataset.

    Note:
        This helper is not called by any cell visible in this notebook.
    """
    # Geowiki
    geowiki_dataset = None
    if add_geowiki:
        # Define split
        # For any other subset (e.g. 'testing') geowiki_dataset stays None.
        if subset == 'training':
            geowiki_dataset = geowiki_train
        elif subset == 'validation':
            geowiki_dataset = geowiki_val

    # Nigeria
    nigeria_root_path = DATA_DIR / 'features' / 'nigeria-cropharvest'
    nigeria_dataset = None
    if add_nigeria or (subset == 'testing') or (subset == "validation" and evaluating):
        # We want to define Nigeria dataset in the following not mutually exclusive cases: 1) for training (i.e. when add_nigeria=True);
        # 2) for testing with test split; 3) for testing with validation split during development (i.e. evaluating=True).
        nigeria_dataset = NigeriaCropHarvestDataset(nigeria_root_path, split=subset)      

    return LandTypeClassificationDataset(
        subset=subset,
        include_geowiki=add_geowiki,
        include_nigeria=add_nigeria,
        evaluating=evaluating,
        geowiki_dataset=geowiki_dataset,
        nigeria_dataset=nigeria_dataset,
        normalizing_dict=normalizing_dict,
    )

# Train Random Forest model

## Evaluate on Nigeria test set

### 1. Training only on geowiki

### 1.1 Geowiki world

In [6]:
# Experiment 1.1: train on the whole Geowiki set only.
add_geowiki, add_nigeria = True, False
geowiki_subset = 'world'  # ignored when add_geowiki is False
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict.h5
Loading normalizing dict geowiki_normalizing_dict.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 19808
Total number of files used for training: 19808
Number of model parameters: 25473
Number of instances in Geowiki training set: 19808
Total number of files used for training: 19808
Number of instances in Geowiki validation set: 4953
Total number of files used for validation: 4953
Number of instances in Nigeria testing set: 455


In [7]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
# Original note: results are a bit better than in notebooks 15 and 17, possibly because the
# test set is now normalized with the training-set normalizing dict.
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.755
RF precision test set: 0.522
RF recall test set: 0.836
RF f1-score test set: 0.643
RF accuracy test set: 0.626


### Only with S2 data

In [8]:
# Same configuration as the previous split cell, restricted to the S2 features.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)

Found normalizing dict geowiki_normalizing_dict.h5
Loading normalizing dict geowiki_normalizing_dict.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 19808
Total number of files used for training: 19808
Number of model parameters: 25473
Number of instances in Geowiki training set: 19808
Total number of files used for training: 19808
Number of instances in Geowiki validation set: 4953
Total number of files used for validation: 4953
Number of instances in Nigeria testing set: 455


In [9]:
# Fit a random forest on the S2-only training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.691
RF precision test set: 0.53
RF recall test set: 0.732
RF f1-score test set: 0.615
RF accuracy test set: 0.631


### 1.2 Geowiki neighbours

In [103]:
# Experiment 1.2: train on the 'neighbours1' Geowiki subset only.
add_geowiki, add_nigeria = True, False
geowiki_subset = 'neighbours1'  # ignored when add_geowiki is False
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Loading normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 632
Total number of files used for training: 632
Number of model parameters: 25473
Number of instances in Geowiki training set: 632
Total number of files used for training: 632
Number of instances in Geowiki validation set: 158
Total number of files used for validation: 158
Number of instances in Nigeria testing set: 455


In [105]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.842
RF precision test set: 0.695
RF recall test set: 0.798
RF f1-score test set: 0.743
RF accuracy test set: 0.778


### Only with S2 data

In [107]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Found normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Loading normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 632
Total number of files used for training: 632
Number of model parameters: 25473
Number of instances in Geowiki training set: 632
Total number of files used for training: 632
Number of instances in Geowiki validation set: 158
Total number of files used for validation: 158
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.778
RF precision test set: 0.723
RF recall test set: 0.47
RF f1-score test set: 0.57
RF accuracy test set: 0.714


### 1.3 Geowiki nigeria

In [108]:
# Experiment 1.3: train on the 'nigeria' Geowiki subset only.
add_geowiki, add_nigeria = True, False
geowiki_subset = 'nigeria'  # ignored when add_geowiki is False
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of instances in Geowiki validation set: 91
Total number of files used for validation: 91
Number of instances in Nigeria testing set: 455


In [110]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.755
RF precision test set: 0.495
RF recall test set: 0.891
RF f1-score test set: 0.637
RF accuracy test set: 0.591


### Only with S2 data

In [111]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of instances in Geowiki validation set: 91
Total number of files used for validation: 91
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.606
RF precision test set: 0.471
RF recall test set: 0.574
RF f1-score test set: 0.517
RF accuracy test set: 0.569


### 2.1 Training only with Nigeria train set

In [112]:
# Experiment 2.1: train on the Nigeria training set only (geowiki_subset keeps its default).
add_geowiki, add_nigeria = False, True
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
)

Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of model parameters: 25473
Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 454
Number of instances in Nigeria testing set: 455


In [113]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.911
RF precision test set: 0.774
RF recall test set: 0.787
RF f1-score test set: 0.78
RF accuracy test set: 0.822


### Only with S2 data

In [114]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
# geowiki_subset is carried over from an earlier cell; per the original note it
# does not matter here (add_geowiki is False in this section).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of model parameters: 25473
Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 454
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.88
RF precision test set: 0.749
RF recall test set: 0.765
RF f1-score test set: 0.757
RF accuracy test set: 0.802


### 3. Training using Nigeria validation set as well

In [60]:
# Experiment 3: Nigeria only; the validation split is merged into training in the next cell.
add_geowiki, add_nigeria = False, True
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
)

Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of model parameters: 25473
Number of instances in Nigeria training set: 913
Total number of files used for training: 913
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 454
Number of instances in Nigeria testing set: 455


In [63]:
# Merge the training and validation splits into a single training set.
# (Original author's note: the normalizing dicts would be the same anyway.)
X_train_val = np.concatenate([X_train, X_val], axis=0)
y_train_val = np.concatenate([y_train, y_val], axis=0)

# Fit on train + validation and score on the untouched test split.
rf = RandomForestClassifier(random_state=0).fit(X_train_val, y_train_val)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.919
RF precision test set: 0.79
RF recall test set: 0.76
RF f1-score test set: 0.774
RF accuracy test set: 0.822


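Using the validation data for training gives mixed results here: ROC-AUC (0.911 → 0.919) and precision (0.774 → 0.79) improve slightly, recall (0.787 → 0.76) and F1 (0.78 → 0.774) drop slightly, and accuracy is unchanged (0.822).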

### 4. Train on geowiki and nigeria combined

### 4.1 Geowiki world

In [118]:
# Experiment 4.1: train on the whole Geowiki set combined with the Nigeria training set.
add_geowiki = True
add_nigeria = True
geowiki_subset = 'world'
# Pass geowiki_subset explicitly, like the sibling cells 4.2 and 4.3 do, instead of
# silently relying on the default value of get_dataset_splits matching the variable above.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(add_geowiki, add_nigeria, geowiki_subset=geowiki_subset)

Found normalizing dict geowiki_normalizing_dict.h5
Loading normalizing dict geowiki_normalizing_dict.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 19808
Number of instances in Nigeria training set: 913
Total number of files used for training: 20721
Number of model parameters: 25473
Number of instances in Geowiki training set: 19808
Number of instances in Nigeria training set: 913
Total number of files used for training: 20721
Number of instances in Geowiki validation set: 4953
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 5407
Number of instances in Nigeria testing set: 455


In [119]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.796
RF precision test set: 0.569
RF recall test set: 0.858
RF f1-score test set: 0.684
RF accuracy test set: 0.681


### Only with S2 data

In [120]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Found normalizing dict geowiki_normalizing_dict.h5
Loading normalizing dict geowiki_normalizing_dict.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 19808
Number of instances in Nigeria training set: 913
Total number of files used for training: 20721
Number of model parameters: 25473
Number of instances in Geowiki training set: 19808
Number of instances in Nigeria training set: 913
Total number of files used for training: 20721
Number of instances in Geowiki validation set: 4953
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 5407
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.722
RF precision test set: 0.587
RF recall test set: 0.667
RF f1-score test set: 0.624
RF accuracy test set: 0.677


### 4.2 Geowiki neighbours

In [121]:
# Experiment 4.2: 'neighbours1' Geowiki subset combined with the Nigeria training set.
add_geowiki, add_nigeria = True, True
geowiki_subset = 'neighbours1'
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Loading normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 632
Number of instances in Nigeria training set: 913
Total number of files used for training: 1545
Number of model parameters: 25473
Number of instances in Geowiki training set: 632
Number of instances in Nigeria training set: 913
Total number of files used for training: 1545
Number of instances in Geowiki validation set: 158
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 612
Number of instances in Nigeria testing set: 455


In [122]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.916
RF precision test set: 0.791
RF recall test set: 0.765
RF f1-score test set: 0.778
RF accuracy test set: 0.824


### Only with S2 data

In [123]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Found normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Loading normalizing dict geowiki_normalizing_dict_Ghana_Togo_Nigeria_Cameroon_Benin.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 632
Number of instances in Nigeria training set: 913
Total number of files used for training: 1545
Number of model parameters: 25473
Number of instances in Geowiki training set: 632
Number of instances in Nigeria training set: 913
Total number of files used for training: 1545
Number of instances in Geowiki validation set: 158
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 612
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.887
RF precision test set: 0.764
RF recall test set: 0.743
RF f1-score test set: 0.753
RF accuracy test set: 0.804


### 4.3 Geowiki Nigeria

In [124]:
# Experiment 4.3: 'nigeria' Geowiki subset combined with the Nigeria training set.
add_geowiki, add_nigeria = True, True
geowiki_subset = 'nigeria'
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of instances in Geowiki validation set: 91
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 545
Number of instances in Nigeria testing set: 455


In [125]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.915
RF precision test set: 0.782
RF recall test set: 0.825
RF f1-score test set: 0.803
RF accuracy test set: 0.837


### Only with S2 data

In [126]:
# Rebuild the splits with S2 features only, then fit and evaluate in one go.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=True,
)
print('')  # blank line between the loading log and the metrics
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of instances in Geowiki validation set: 91
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 545
Number of instances in Nigeria testing set: 455

RF roc-auc test set: 0.887
RF precision test set: 0.755
RF recall test set: 0.809
RF f1-score test set: 0.781
RF accuracy test set: 0.818


### 5. Train on geowiki (nigeria subset -> best) and nigeria combined (S2 channels only) 

In [76]:
# Experiment 5: 'nigeria' Geowiki subset + Nigeria training set, S2 features only.
add_geowiki, add_nigeria = True, True
geowiki_subset = 'nigeria'
S2_features_only = True
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
    S2_features_only=S2_features_only,
)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of instances in Geowiki validation set: 91
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 545
Number of instances in Nigeria testing set: 455


In [83]:
# Fit a random forest on the training split and score it on the test split.
# NOTE(review): the output recorded under this cell is identical to the one recorded for
# the Geowiki-Nigeria-only runs (sections 1.3 and 6.1) and differs from the S2-only run of
# section 4.3, which uses the same configuration. It looks stale (cells executed out of
# order) — re-run this cell right after the split cell above to confirm.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.755
RF precision test set: 0.495
RF recall test set: 0.891
RF f1-score test set: 0.637
RF accuracy test set: 0.591


### 6.1 Train on Geowiki Nigeria only --> this shows the value of the hand-labelled data

In [84]:
# Experiment 6.1: 'nigeria' Geowiki subset only, without the Nigeria training set.
add_geowiki, add_nigeria = True, False
geowiki_subset = 'nigeria'
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Total number of files used for training: 361
Number of instances in Geowiki validation set: 91
Total number of files used for validation: 91
Number of instances in Nigeria testing set: 455


In [85]:
# Fit a random forest on the training split and score it on the test split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.755
RF precision test set: 0.495
RF recall test set: 0.891
RF f1-score test set: 0.637
RF accuracy test set: 0.591


### 6.2 Train on Geowiki Nigeria only (with validation set for training too)

In [88]:
# Merge the training and validation splits (from the previous split cell) into one training set.
X_train_val = np.concatenate([X_train, X_val], axis=0)
y_train_val = np.concatenate([y_train, y_val], axis=0)

# Fit on train + validation and score on the untouched test split.
rf = RandomForestClassifier(random_state=0).fit(X_train_val, y_train_val)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.763
RF precision test set: 0.5
RF recall test set: 0.891
RF f1-score test set: 0.64
RF accuracy test set: 0.598


### 7. Train on geowiki and nigeria combined + validation set

In [89]:
# Experiment 7: 'nigeria' Geowiki subset + Nigeria training set; validation is merged in the next cell.
add_geowiki, add_nigeria = True, True
geowiki_subset = 'nigeria'
(X_train, y_train), (X_val, y_val), (X_test, y_test) = get_dataset_splits(
    add_geowiki=add_geowiki,
    add_nigeria=add_nigeria,
    geowiki_subset=geowiki_subset,
)

Found normalizing dict geowiki_normalizing_dict_Nigeria.h5
Loading normalizing dict geowiki_normalizing_dict_Nigeria.h5
Creating Geowiki train split
Creating Geowiki val split
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of model parameters: 25473
Number of instances in Geowiki training set: 361
Number of instances in Nigeria training set: 913
Total number of files used for training: 1274
Number of instances in Geowiki validation set: 91
Number of instances in Nigeria validation set: 454
Total number of files used for validation: 545
Number of instances in Nigeria testing set: 455


In [92]:
# Merge the training and validation splits (from the previous split cell) into one training set.
X_train_val = np.concatenate([X_train, X_val], axis=0)
y_train_val = np.concatenate([y_train, y_val], axis=0)

# Fit on train + validation and score on the untouched test split.
rf = RandomForestClassifier(random_state=0).fit(X_train_val, y_train_val)
# Class-1 probabilities: needed for ROC-AUC and thresholded inside get_metrics.
y_probs = rf.predict_proba(X_test)[:, 1]
metrics = get_metrics(y_test, y_probs)

RF roc-auc test set: 0.923
RF precision test set: 0.799
RF recall test set: 0.825
RF f1-score test set: 0.812
RF accuracy test set: 0.846
