In [1]:
from pathlib import Path
import h5py

import numpy as np
from sklearn.ensemble import RandomForestClassifier

from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task
from cropharvest.columns import NullableColumns, RequiredColumns
from cropharvest.engineer import Engineer
from cropharvest.bands import BANDS, DYNAMIC_BANDS, STATIC_BANDS, REMOVED_BANDS

In [5]:
DATA_DIR = "../data/cropharvest"

## Get geowiki data

- Load all labels and get all h5 files paths
- Recalculate normalizing dict
- Get as array

In [6]:
class GeowikiCropHarvest():
    def __init__(self, root="data"):    
        self.root = Path(root)
        cropharvest_labels = CropHarvestLabels(root, download=True)
        cropharvest_df = cropharvest_labels.as_geojson()
        self.labels = cropharvest_df[cropharvest_df['dataset'] == 'geowiki-landcover-2017'].reset_index(drop=True)
        self._discard_missing_files()

        self.filepaths = self.labels['path'].tolist()
        self.y_vals = self.labels['is_crop'].tolist()
        self.normalizing_dict = None # TODO: recalculate

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, index: int):
        file = h5py.File(self.filepaths[index], "r")
        return self._normalize(file.get("array")[:]), self.y_vals[index]
    
    def as_array(self, flatten_x=False):
        indices_to_sample = list(range(len(self)))
        X, Y = zip(*[self[i] for i in indices_to_sample])
        X_np, y_np = np.stack(X), np.stack(Y)
        if flatten_x:
            X_np = self._flatten_array(X_np)
        return X_np, y_np

    def _path_from_row(self, row):
        path = self.root / f"features/arrays/{row[RequiredColumns.INDEX]}_{row[RequiredColumns.DATASET]}.h5"
        if not path.exists():
            return None
        return path

    def _discard_missing_files(self):
        self.labels['path'] = self.labels.apply(lambda row: self._path_from_row(row), axis=1)
        self.labels['missing_file'] = self.labels['path'].isna()
        self.labels = self.labels[~self.labels['missing_file']].reset_index(drop=True)

    def _normalize(self, array):
        if not self.normalizing_dict:
            return array
        return (array - self.normalizing_dict["mean"]) / self.normalizing_dict["std"]
            
    @staticmethod
    def _flatten_array(array):
        return array.reshape(array.shape[0], -1)

dataset = GeowikiCropHarvest()    

NotADirectoryError: data should be a directory.

In [4]:
dataset.labels

Unnamed: 0,harvest_date,planting_date,label,classification_label,index,is_crop,lat,lon,dataset,collection_date,export_end_date,externally_contributed_dataset,is_test,geometry,path,missing_file
0,,,,,0,0,-16.547619,46.250000,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (46.25000 -16.54762),data/features/arrays/0_geowiki-landcover-2017.h5,False
1,,,,,1,1,-18.547619,48.250000,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (48.25000 -18.54762),data/features/arrays/1_geowiki-landcover-2017.h5,False
2,,,,,2,0,-21.547619,44.250000,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (44.25000 -21.54762),data/features/arrays/2_geowiki-landcover-2017.h5,False
3,,,,,3,1,-17.547619,45.250000,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (45.25000 -17.54762),data/features/arrays/3_geowiki-landcover-2017.h5,False
4,,,,,4,0,-21.547619,46.250000,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (46.25000 -21.54762),data/features/arrays/4_geowiki-landcover-2017.h5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24756,,,,,35849,1,16.651786,103.550595,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (103.55060 16.65179),data/features/arrays/35849_geowiki-landcover-2...,False
24757,,,,,35850,1,22.651786,84.550595,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (84.55060 22.65179),data/features/arrays/35850_geowiki-landcover-2...,False
24758,,,,,35851,1,10.651786,76.550595,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (76.55060 10.65179),data/features/arrays/35851_geowiki-landcover-2...,False
24759,,,,,35860,0,-0.348214,36.550595,geowiki-landcover-2017,2016-09-30T00:00:00,2017-02-01T00:00:00,False,False,POINT (36.55060 -0.34821),data/features/arrays/35860_geowiki-landcover-2...,False


In [7]:
X, y = dataset.as_array(flatten_x=False)
X_flat, y_flat = dataset.as_array(flatten_x=True)

In [9]:
print(X.shape, y.shape)
X_flat.shape, y_flat.shape

(24761, 12, 18) (24761,)


((24761, 216), (24761,))

## Get Nigeria data

# Train model and evaluate on Nigeria

## 1. Use all data and geowiki labels for training (no validation set)

- Train random forest on it as in demo.ipynb
- Normalizing dict
- Test on Nigeria set (is_test)

In [10]:
model = RandomForestClassifier(random_state=0)
model.fit(X_flat, y_flat)

RandomForestClassifier(random_state=0)

In [11]:
# Predict on training set (just a quick check)
model.predict_proba(X_flat[:10])

array([[0.86, 0.14],
       [0.05, 0.95],
       [0.86, 0.14],
       [0.14, 0.86],
       [0.78, 0.22],
       [0.18, 0.82],
       [0.83, 0.17],
       [0.19, 0.81],
       [0.87, 0.13],
       [0.75, 0.25]])

In [12]:
y_flat[:10]

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 0])