# Split patchlets into train/validation/test

The notebook:

 * loads the dataframe with the patchlets descriptions
 * splits the patchlets into training, validation, and test sets according to that dataframe

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os 
import json
from pathlib import Path
import logging
from datetime import datetime
from typing import Callable, List, Any
from distutils.dir_util import copy_tree
from concurrent.futures import ProcessPoolExecutor

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import geopandas as gpd

In [3]:
from fs_s3fs import S3FS

### Utils

In [4]:
def multiprocess(process_fun: Callable, arguments: List[Any], max_workers: int = 4) -> List[Any]:
    """
    Run a function over a list of arguments in a process pool, showing a tqdm progress bar.

    Parameters
    ----------
    process_fun: Function that processes a single item.
    arguments: Items to process; ``process_fun`` is called once per item.
    max_workers: Maximum number of worker processes in the pool.

    Returns
    -------
    A list with one result per item, in the same order as ``arguments``.
    """
    n_items = len(arguments)
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        result_iterator = pool.map(process_fun, arguments)
        # Draining the iterator through tqdm advances the bar as each result arrives.
        progress = tqdm(result_iterator, total=n_items)
        return list(progress)

# Basic config 

In [5]:
# Handle to the S3 bucket that holds the patchlet data.
# Credentials are read from the standard AWS environment variables instead of being typed
# into the notebook, so secrets never end up in the saved file or its history.
# When a variable is unset, None is passed; per the fs-s3fs documentation the key is then
# looked up in the standard AWS configuration (TODO confirm for the installed version).
filesystem = S3FS("bucket-name",
                  aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
                  aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
                  region="eu-central-1")

In [6]:
# --- Paths inside the S3 bucket ---
BASE_S3_PATH = "data/Lithuania"
METADATA_PATH = os.path.join(BASE_S3_PATH, "patchlets_meta")
PATCHLETS_META_FOLDER = "data/Lithuania/patchlets_meta/"
# Train/test/validation NPZ folder on S3; the data should be copied from here to the
# local folder below, because reading it locally is faster.
NPZ_FOLDER_S3 = "data/Lithuania/patchlets_npz"

# --- Paths on the local machine ---
NPZ_FOLDER_LOCAL = "/home/ubuntu/training_npz"
MODEL_DIR = "/home/ubuntu/model"

In [7]:
# Raise the root logger threshold so that only CRITICAL records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

# Switch the 'tensorflow' logger off entirely.
logging.getLogger('tensorflow').disabled = True

### Load patchlets metadata 

In [8]:
# Read the patchlet details table from the bucket. The handle is opened in a `with`
# block so it is closed as soon as pandas has parsed the CSV instead of being leaked.
with filesystem.open(f'{METADATA_PATH}/patchlet_details.csv') as metadata_file:
    gdf = pd.read_csv(metadata_file)

In [9]:
gdf.head()

Unnamed: 0,chunk,eopatch,patchlet,chunk_pos,timestamp,minimum,maximum,mean,median,std,...,norm_perc99_b3,norm_meanstd_mean_b0,norm_meanstd_mean_b1,norm_meanstd_mean_b2,norm_meanstd_mean_b3,norm_meanstd_std_b0,norm_meanstd_std_b1,norm_meanstd_std_b2,norm_meanstd_std_b3,validation
0,patchlets_field_delineation_0.npz,eopatch-0036,eopatch-0036_0,0,2019-03-22,442,4108,1218.079838,1070.0,441.325803,...,2922.651155,1212.953566,948.126377,960.306575,1804.907503,468.738789,168.731122,260.523992,498.383968,False
1,patchlets_field_delineation_0.npz,eopatch-0036,eopatch-0036_0,1,2019-04-01,359,3699,1284.585728,1098.0,519.615472,...,3431.897393,1406.046835,1065.360873,1056.628138,2334.73776,635.30072,178.551102,304.276842,535.619971,False
2,patchlets_field_delineation_0.npz,eopatch-0036,eopatch-0036_0,2,2019-04-18,509,3953,1449.912735,1209.0,604.895936,...,3431.897393,1406.046835,1065.360873,1056.628138,2334.73776,635.30072,178.551102,304.276842,535.619971,False
3,patchlets_field_delineation_0.npz,eopatch-0036,eopatch-0036_0,3,2019-04-23,578,3975,1499.046513,1247.0,619.099661,...,3431.897393,1406.046835,1065.360873,1056.628138,2334.73776,635.30072,178.551102,304.276842,535.619971,False
4,patchlets_field_delineation_0.npz,eopatch-0036,eopatch-0036_0,4,2019-04-26,610,3707,1409.728043,1179.0,570.944169,...,3431.897393,1406.046835,1065.360873,1056.628138,2334.73776,635.30072,178.551102,304.276842,535.619971,False


In [10]:
# Summary statistics of the number of dataframe rows recorded per patchlet.
gdf.groupby('patchlet')['patchlet'].count().describe()

count    5027.000000
mean       17.090511
std         3.661889
min         3.000000
25%        15.000000
50%        18.000000
75%        20.000000
max        27.000000
Name: patchlet, dtype: float64

In [11]:
# Average number of dataframe rows per distinct eopatch.
len(gdf)/len(gdf.eopatch.unique())

146.86153846153846

### Create train/validation/test datasets

In [12]:
# Fraction of the non-test eopatches held out for validation, and the seed that makes the
# random hold-out reproducible across kernel restarts.
VALIDATION_FRACTION = 0.1
SPLIT_SEED = 42

# Rows flagged `validation` in the metadata form the test set; the rest is split further.
TEST_DATASET = gdf[gdf.validation]
train_val_dataset = gdf[~gdf.validation]

# Whole eopatches are held out, so all rows of one eopatch land on the same side of the split.
eops = train_val_dataset.eopatch.unique()
split_rng = np.random.RandomState(SPLIT_SEED)
# replace=False: sampling with replacement (the numpy default) can draw the same eopatch
# more than once, which silently shrinks the validation set below the requested fraction.
validation_eops = split_rng.choice(eops, size=int(len(eops) * VALIDATION_FRACTION), replace=False)

in_validation = train_val_dataset.eopatch.isin(validation_eops)
VALIDATION_DATASET = train_val_dataset[in_validation]
TRAIN_DATASET = train_val_dataset[~in_validation]

# The three splits must partition the dataframe.
assert len(TRAIN_DATASET) + len(VALIDATION_DATASET) + len(TEST_DATASET) == len(gdf)

# CREATE TRAIN/TEST/VALIDATION FILES

The S3 bucket should be mounted locally before the export cell below is run.

In [13]:
# One-off export of the three split dataframes as CSV files under /home/ubuntu.
# Presumably kept commented out so that a re-run does not overwrite the stored split files.
# The paths are f-strings: without the `f` prefix '{PATCHLETS_META_FOLDER}' would be written
# literally into the path instead of being replaced by the folder name.
# TRAIN_DATASET.to_csv(f'/home/ubuntu/{PATCHLETS_META_FOLDER}/patchlet_details_train_dataset.csv', index=False)
# VALIDATION_DATASET.to_csv(f'/home/ubuntu/{PATCHLETS_META_FOLDER}/patchlet_details_validation_dataset.csv', index=False)
# TEST_DATASET.to_csv(f'/home/ubuntu/{PATCHLETS_META_FOLDER}/patchlet_details_test_dataset.csv', index=False)

In [14]:
def _read_split_details(split_name):
    """Read `patchlet_details_<split_name>_dataset.csv` from the bucket, closing the file handle afterwards."""
    csv_path = f'{PATCHLETS_META_FOLDER}/patchlet_details_{split_name}_dataset.csv'
    with filesystem.open(csv_path) as csv_file:
        return pd.read_csv(csv_file)


# Load the stored split from the bucket. This replaces any TRAIN/VALIDATION/TEST dataframes
# that are already in memory.
TRAIN_DATASET = _read_split_details('train')
VALIDATION_DATASET = _read_split_details('validation')
TEST_DATASET = _read_split_details('test')

In [15]:
def test_train_val_split(chunk):
    if chunk.startswith('patchlets'):
        print(f"Processing chunk: {chunk}")
        data = np.load(os.path.join(NPZ_FOLDER_LOCAL, chunk), allow_pickle=True)

        idxs_train = TRAIN_DATASET[TRAIN_DATASET.chunk == chunk].chunk_pos 
        idxs_test = TEST_DATASET[TEST_DATASET .chunk == chunk].chunk_pos 
        idxs_val = VALIDATION_DATASET[VALIDATION_DATASET.chunk == chunk].chunk_pos  

        train = {}
        for key in data.keys():
            train[key] = data[key][idxs_train]


        test = {}
        for key in data.keys():
            test[key] = data[key][idxs_test]

        val = {}
        for key in data.keys():
            val[key] = data[key][idxs_val]



        np.savez(os.path.join(NPZ_FOLDER_LOCAL, 'train', chunk), **train)
        np.savez(os.path.join(NPZ_FOLDER_LOCAL, 'test', chunk), **test)
        np.savez(os.path.join(NPZ_FOLDER_LOCAL, 'validation', chunk), **val)

### Create test/train/validation NPZ files 

In [16]:
# Split every chunk in parallel with 8 worker processes. The raw directory listing is passed
# as-is; test_train_val_split ignores entries whose name does not start with 'patchlets'.
multiprocess(test_train_val_split, os.listdir(NPZ_FOLDER_LOCAL), max_workers=8)

##### Check results with some sanity checks 

In [17]:
# Local folders holding the per-split NPZ files.
train_folder, val_folder, test_folder = (
    os.path.join(NPZ_FOLDER_LOCAL, split_name)
    for split_name in ('train', 'validation', 'test')
)

In [18]:
# Arrays whose first dimension must match the number of dataframe rows of the split.
SANITY_CHECK_KEYS = ('y_boundary', 'y_extent', 'y_distance', 'X')

# Spot-check two chunks: for every split, each saved array must have exactly as many rows
# as the split dataframe lists for that chunk.
for i in [1, 35]:
    chunk = f'patchlets_field_delineation_{i}.npz'

    splits = (
        ('train', TRAIN_DATASET, train_folder),
        ('validation', VALIDATION_DATASET, val_folder),
        ('test', TEST_DATASET, test_folder),
    )
    for split_name, dataset, folder in splits:
        expected_rows = len(dataset[dataset.chunk == chunk].chunk_pos)
        # The `with` block closes the NPZ file once its arrays have been checked.
        with np.load(os.path.join(folder, chunk)) as split_chunk:
            for key in SANITY_CHECK_KEYS:
                saved_rows = split_chunk[key].shape[0]
                assert expected_rows == saved_rows, (
                    f"{split_name}/{chunk}: '{key}' has {saved_rows} rows, "
                    f"but the dataframe lists {expected_rows}"
                )

    print(f"For chunk {chunk} the dataframe and npz files lengths match.")

For chunk patchlets_field_delineation_1.npz the dataframe and npz files lengths match.
For chunk patchlets_field_delineation_35.npz the dataframe and npz files lengths match.
