In [12]:
import sklearn
import h5py
import os
import numpy as np
from scipy import signal
from sklearn import ensemble
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

In [13]:
TRAIN_DATASET_PATH = os.path.join("<path/to/data/folder>", "train_dataset.hdf5")
VAL_DATASET_PATH = os.path.join("<path/to/data/folder>", "val_dataset.hdf5")
TEST_DATASET_PATH = os.path.join("<path/to/data/folder>", "test_dataset.hdf5")
MODEL_PATH = "random_forest"

In [None]:
SAMPLES_PER_TILE = 1000
BANDS_COUNT = 10

In [18]:
train_dataset = h5py.File(TRAIN_DATASET_PATH, "r")
val_dataset = h5py.File(VAL_DATASET_PATH, "r")

In [19]:
def extract_features_and_labels(datasets):
    xs, ys = [], []
    for dataset in datasets:
        groups = dataset.groups
        with tqdm(total=len(groups)) as pbar:
            for tile in groups:
                tile_data = dataset[tile]
                optical = tile_data["optical"]

                height, width, _ = optical.shape
                stack = np.zeros((height, width, BANDS_COUNT + 1))
                stack[:, :, 0:6] = optical

                dem = tile_data["dem"]
                stack[:, :, 6] = dem[:, :, 0]
                slope = tile_data["slope"]
                stack[:, :, 7] = slope[:, :, 0]

                dem = tile_data["dem"]
                stack[:, :, 6] = dem[:, :, 0]
                slope = tile_data["slope"]
                stack[:, :, 7] = slope[:, :, 0]

                groundtruth = tile_data["groundtruth"]
                stack[:, :, -1] = groundtruth[:, :, 0]

                stack = stack.reshape((height * width, BANDS_COUNT + 1))
                sample_idxs = np.random.choice(stack.shape[0], size=SAMPLES_PER_TILE, replace=False)
                samples = stack[sample_idxs]

                xs.extend(samples[:, :-1])
                ys.extend(samples[:, -1])
                
                pbar.update(1)

        return xs, ys

In [9]:
# Here, we treat train and val subsets equally as we do not perform any kind of early stopping or hyperparameter optimization
train_x, train_y = extract_features_and_labels([train_dataset, val_dataset])

100%|███████████████████████████████████████████████████████████████| 237/237 [06:58<00:00,  1.76s/it]


In [10]:
classifier = ensemble.RandomForestClassifier(
    n_jobs=-1, 
    verbose=11
)

In [11]:
classifier.fit(train_x, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 33.5min finished


RandomForestClassifier(n_jobs=-1, verbose=1)

In [12]:
classifier.score(train_x, train_y)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   14.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   49.4s finished


0.9999973839662447

In [13]:
pickle.dump(classifier, open(MODEL_PATH, "wb"))

In [20]:
classifier = pickle.load(open(MODEL_PATH, "rb"))

TODO: prediction on test, performance evaluation, comments

In [10]:
def test_predict(classifier, dataset, tiles, mode=MODE):
    feature_idxs = {
        0: [0, 1, 2, 3, 4, 5],
        1: [0, 1, 2, 3, 4, 5, 6, 7],
        2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    }[mode]
    feature_count = {
        0: 6,
        1: 8,
        2: 16
    }[mode]
    
    xs, ys = [], []
    with tqdm(total=len(tiles)) as pbar:
        for tile in tiles:
            tile_data = dataset[tile]
            optical = tile_data["optical"]

            height, width, _ = optical.shape
            stack = np.zeros((height, width, feature_count))
            stack[:, :, 0:6] = optical

            if mode > 0:
                dem = tile_data["dem"]
                stack[:, :, 6] = dem[:, :, 0]
                slope = tile_data["slope"]
                stack[:, :, 7] = slope[:, :, 0]

            if mode > 1:
                sar_asc = np.array(tile_data["sar_asc"])
                sar_desc = np.array(tile_data["sar_desc"])

                vv1, vh1, vv2, vh2 = calc_intensity(sar_asc)
                vv, vh = (vv1 + vv2) / 2, (vh1 + vh2) / 2
                stack[:, :, 8] = vv
                stack[:, :, 9] = vh

                vv1, vh1, vv2, vh2 = calc_intensity(sar_desc)
                vv, vh = (vv1 + vv2) / 2, (vh1 + vh2) / 2
                stack[:, :, 10] = vv
                stack[:, :, 11] = vh

                stack[:, :, 12] = calc_coherence(sar_asc, mode=0)
                stack[:, :, 13] = calc_coherence(sar_asc, mode=1)
                stack[:, :, 14] = calc_coherence(sar_desc, mode=0)
                stack[:, :, 15] = calc_coherence(sar_desc, mode=1)

                # coverage = calc_sar_mask(sar_asc, sar_desc)
                # stack[coverage] = 0

            stack = np.nan_to_num(stack)
            stack = stack.reshape((height * width, feature_count))
            
            prediction = classifier.predict(stack)
            prediction = prediction.reshape((height, width))
            
            with h5py.File(os.path.join("predictions", "predictions.hdf5"), "a") as predictions:
                group = predictions[tile]
                group.create_dataset(f"randomforest{MODE}", data=prediction)
            
            pbar.update(1)

In [11]:
test_predict(classifier, dataset, test_tiles, mode=MODE)

  0%|                                                                          | 0/60 [00:00<?, ?it/s][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.3s finished
  2%|█                                                                 | 1/60 [00:03<03:04,  3.13s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.9s finished
  3%|██▏                                                               | 2/60 [00:06<03:21,  3.48s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.2s finished
  5%|███▎                                        

 43%|████████████████████████████▏                                    | 26/60 [01:18<01:24,  2.50s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.6s finished
 45%|█████████████████████████████▎                                   | 27/60 [01:21<01:31,  2.76s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.9s finished
 47%|██████████████████████████████▎                                  | 28/60 [01:24<01:27,  2.74s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.8s finished
 48%|███████████████████████████████▍            

 87%|████████████████████████████████████████████████████████▎        | 52/60 [02:28<00:22,  2.84s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.2s finished
 88%|█████████████████████████████████████████████████████████▍       | 53/60 [02:31<00:20,  2.90s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.2s finished
 90%|██████████████████████████████████████████████████████████▌      | 54/60 [02:34<00:17,  2.93s/it][Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.7s finished
 92%|████████████████████████████████████████████