In [24]:
import sys
from pathlib import Path
from argparse import ArgumentParser, Namespace
from collections import Counter
import pickle

import geopandas as gdp
from tqdm import tqdm

sys.path.append("..")

from src.models import STR2MODEL
from src.engineer import GeoWikiEngineer
from src.exporters.sentinel.cloudfree import BANDS

## Initialize model

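Passing `args=[]` to `parse_args` makes argparse work inside a Jupyter notebook: without it, argparse tries to parse the kernel's own command-line arguments and fails. See https://stackoverflow.com/questions/30656777/how-to-call-module-written-with-argparse-in-ipython-notebook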

In [15]:
# Collect the land-cover model's default hyperparameters.
# parse_args(args=[]) parses an empty argument list, so every option keeps its default.
parser = ArgumentParser()
land_cover_parser = STR2MODEL["land_cover"].add_model_specific_args(parser)
model_args = land_cover_parser.parse_args(args=[])
model_args_dict = vars(model_args)

# Override the defaults we want to change for this analysis
model_args_dict.update(
    add_togo=False,
    multi_headed=False,
    num_classification_layers=1,
)

# Wrap the edited hyperparameters in a fresh Namespace and build the model;
# the trailing expression displays the hyperparameters the model ended up with.
model_args = Namespace(**model_args_dict)
model = STR2MODEL["land_cover"](model_args)
model.hparams

Number of geowiki instances in training set: 27947


Namespace(add_geowiki=True, add_togo=False, alpha=10, batch_size=64, data_folder='/home/gajo/code/togo-crop-mask/notebooks/../data', hidden_vector_size=64, learning_rate=0.001, lstm_dropout=0.2, model_base='lstm', multi_headed=False, num_classification_layers=1, num_lstm_layers=1, probability_threshold=0.5, remove_b1_b10=True)

## 1. Labels distribution

### 1a. Geowiki train and validation

Training set

In [16]:
# Tally how often each label value occurs across all training batches.
train_loader = model.train_dataloader()
counter = Counter()
for sample in train_loader:
    # Each batch unpacks into three parts; only the labels `y` are counted.
    x, y, weight = sample
    for label in y.numpy():
        counter[label] += 1

Number of geowiki instances in training set: 27947


In [17]:
# Share of label 1.0 among the 0.0/1.0 labels tallied in `counter`.
n_cropland = counter[1.0]
n_non_cropland = counter[0.0]
cropland_ratio = n_cropland / (n_non_cropland + n_cropland)
print(f'Cropland: {100*cropland_ratio:.1f}%')
print(f'Non-cropland: {100*(1-cropland_ratio):.1f}%')

Cropland: 22.1%
Non-cropland: 77.9%


Validation set


In [18]:
# Tally how often each label value occurs across all validation batches.
val_loader = model.val_dataloader()
counter = Counter()
for sample in val_loader:
    # Each batch unpacks into three parts; only the labels `y` are counted.
    x, y, weight = sample
    for label in y.numpy():
        counter[label] += 1

Number of geowiki instances in validation set: 7301


In [19]:
# Share of label 1.0 among the 0.0/1.0 labels tallied in `counter`.
n_cropland = counter[1.0]
n_non_cropland = counter[0.0]
cropland_ratio = n_cropland / (n_non_cropland + n_cropland)
print(f'Cropland: {100*cropland_ratio:.1f}%')
print(f'Non-cropland: {100*(1-cropland_ratio):.1f}%')

Cropland: 21.2%
Non-cropland: 78.8%


### 1b. Nigeria farmlands test set

In [20]:
# Tally how often each label value occurs across all test batches.
test_loader = model.test_dataloader()
counter = Counter()
for sample in test_loader:
    # Each batch unpacks into three parts; only the labels `y` are counted.
    x, y, weight = sample
    for label in y.numpy():
        counter[label] += 1

Evaluating using the Nigeria evaluation dataset!
Number of instances in nigeria_farmlands_v2 test set: 739


In [21]:
# Share of label 1.0 among the 0.0/1.0 labels tallied in `counter`.
n_cropland = counter[1.0]
n_non_cropland = counter[0.0]
cropland_ratio = n_cropland / (n_non_cropland + n_cropland)
print(f'Cropland: {100*cropland_ratio:.1f}%')
print(f'Non-cropland: {100*(1-cropland_ratio):.1f}%')

Cropland: 53.2%
Non-cropland: 46.8%


<font color='red'> **The target dataset has a very different label balance: it contains many more cropland labels than the training and validation sets!** </font>

## 2. Data distribution

Plot the mean and std of each band, using the normalizing dicts. Note that the normalizing dict covers 14 bands, while my model uses only 12 (B1 and B10 are removed). Plot the distribution of all bands anyway.

In [34]:
# TODO: write helper function for plot of bands distribution

### 2a. Geowiki train and val

In [32]:
# Band names selected for removal, driven by the `remove_b1_b10` hyperparameter.
if model.hparams.remove_b1_b10:
    bands_to_remove = ["B1", "B10"]
else:
    bands_to_remove = []
bands_to_remove

['B1', 'B10']

In [35]:
# Show the model's normalizing dict: a 'mean' and a 'std' array with 14 values each.
model.normalizing_dict

{'mean': array([0.19373134, 0.17126042, 0.16084167, 0.16342165, 0.18624573,
        0.25546337, 0.2904783 , 0.27999799, 0.31447817, 0.10164724,
        0.00873111, 0.22869711, 0.15183401, 0.3227123 ]),
 'std': array([0.14992121, 0.15327113, 0.14417395, 0.1638704 , 0.15853429,
        0.14801799, 0.15064628, 0.14358608, 0.14964237, 0.09366979,
        0.02781958, 0.11075664, 0.09509689, 0.24010132])}

### 2b. Nigeria farmlands test set