In [1]:
# Required libraries
import os
import tarfile
import json
from pathlib import Path
from radiant_mlhub.client import _download as download_file

import datetime
import rasterio
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit

os.environ['MLHUB_API_KEY'] = '3d4d443d2fa0498b2cc1fe278b4936dfbd5e117aa4fe54f193507a6b9e502640'

In [2]:
DOWNLOAD_S1 = False # If you set this to true then the Sentinel-1 data will be downloaded which is not needed in this notebook.

# Select which imagery bands you'd like to download here:
DOWNLOAD_BANDS = {
    'B01': False,
    'B02': False,
    'B03': True,
    'B04': True,
    'B05': False,
    'B06': False,
    'B07': False,
    'B08': True,
    'B8A': False,
    'B09': False,
    'B11': False,
    'B12': False,
    'CLM': True
}

# In this model we will only use Green, Red and NIR bands. You can select to download any number of bands. 
# Our choice relies on the fact that vegetation is most sensitive to these bands. 
# We also donwload the CLM or Cloud Mask layer to exclude cloudy data from the training phase. 
# You can also do a feature selection, and try different combination of bands to see which ones will result in a better accuracy.

In [3]:
FOLDER_BASE = 'ref_south_africa_crops_competition_v1'

def download_archive(archive_name):
    if os.path.exists(archive_name.replace('.tar.gz', '')):
        return
    
    print(f'Downloading {archive_name} ...')
    download_url = f'https://radiant-mlhub.s3.us-west-2.amazonaws.com/archives/{archive_name}'
    download_file(download_url, '.')
    print(f'Extracting {archive_name} ...')
    with tarfile.open(archive_name) as tfile:
        tfile.extractall()
    os.remove(archive_name)

for split in ['train', 'test']:
    # Download the labels
    labels_archive = f'{FOLDER_BASE}_{split}_labels.tar.gz'
    download_archive(labels_archive)
    
    # Download Sentinel-1 data
    if DOWNLOAD_S1:
        s1_archive = f'{FOLDER_BASE}_{split}_source_s1.tar.gz'
        download_archive(s1_archive)
        

    for band, download in DOWNLOAD_BANDS.items():
        if not download:
            continue
        s2_archive = f'{FOLDER_BASE}_{split}_source_s2_{band}.tar.gz'
        download_archive(s2_archive)
        
def resolve_path(base, path):
    return Path(os.path.join(base, path)).resolve()
        
def load_df(collection_id):
    split = collection_id.split('_')[-2]
    collection = json.load(open(f'{collection_id}/collection.json', 'r'))
    rows = []
    item_links = []
    for link in collection['links']:
        if link['rel'] != 'item':
            continue
        item_links.append(link['href'])
        
    for item_link in item_links:
        item_path = f'{collection_id}/{item_link}'
        current_path = os.path.dirname(item_path)
        item = json.load(open(item_path, 'r'))
        tile_id = item['id'].split('_')[-1]
        for asset_key, asset in item['assets'].items():
            rows.append([
                tile_id,
                None,
                None,
                asset_key,
                str(resolve_path(current_path, asset['href']))
            ])
            
        for link in item['links']:
            if link['rel'] != 'source':
                continue
            source_item_id = link['href'].split('/')[-2]
            
            if source_item_id.find('_s1_') > 0 and not DOWNLOAD_S1:
                continue
            elif source_item_id.find('_s1_') > 0:
                for band in ['VV', 'VH']:
                    asset_path = Path(f'{FOLDER_BASE}_{split}_source_s1/{source_item_id}/{band}.tif').resolve()
                    date = '-'.join(source_item_id.split('_')[10:13])
                    
                    rows.append([
                        tile_id,
                        f'{date}T00:00:00Z',
                        's1',
                        band,
                        asset_path
                    ])
                
            if source_item_id.find('_s2_') > 0:
                for band, download in DOWNLOAD_BANDS.items():
                    if not download:
                        continue
                    
                    asset_path = Path(f'{FOLDER_BASE}_{split}_source_s2_{band}/{source_item_id}_{band}.tif').resolve()
                    date = '-'.join(source_item_id.split('_')[10:13])
                    rows.append([
                        tile_id,
                        f'{date}T00:00:00Z',
                        's2',
                        band,
                        asset_path
                    ])
            
    return pd.DataFrame(rows, columns=['tile_id', 'datetime', 'satellite_platform', 'asset', 'file_path'])

competition_train_df = load_df(f'{FOLDER_BASE}_train_labels')
competition_test_df = load_df(f'{FOLDER_BASE}_test_labels')

Downloading ref_south_africa_crops_competition_v1_train_source_s2_B04.tar.gz ...


  0%|          | 0/6363.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B04.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_B08.tar.gz ...


  0%|          | 0/6755.8 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_B08.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_train_source_s2_CLM.tar.gz ...


  0%|          | 0/24.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_train_source_s2_CLM.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_labels.tar.gz ...


  0%|          | 0/10.9 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_labels.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B03.tar.gz ...


  0%|          | 0/2454.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B03.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B04.tar.gz ...


  0%|          | 0/2706.0 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B04.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_B08.tar.gz ...


  0%|          | 0/2877.3 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_B08.tar.gz ...
Downloading ref_south_africa_crops_competition_v1_test_source_s2_CLM.tar.gz ...


  0%|          | 0/10.4 [00:00<?, ?M/s]

Extracting ref_south_africa_crops_competition_v1_test_source_s2_CLM.tar.gz ...


In [4]:
competition_train_df

Unnamed: 0,tile_id,datetime,satellite_platform,asset,file_path
0,2587,,,documentation,E:\ML_HUB\ref_south_africa_crops_competition_v...
1,2587,,,field_ids,E:\ML_HUB\ref_south_africa_crops_competition_v...
2,2587,,,field_info_train,E:\ML_HUB\ref_south_africa_crops_competition_v...
3,2587,,,labels,E:\ML_HUB\ref_south_africa_crops_competition_v...
4,2587,,,raster_values,E:\ML_HUB\ref_south_africa_crops_competition_v...
...,...,...,...,...,...
591109,2198,2017-11-27T00:00:00Z,s2,CLM,E:\ML_HUB\ref_south_africa_crops_competition_v...
591110,2198,2017-11-30T00:00:00Z,s2,B03,E:\ML_HUB\ref_south_africa_crops_competition_v...
591111,2198,2017-11-30T00:00:00Z,s2,B04,E:\ML_HUB\ref_south_africa_crops_competition_v...
591112,2198,2017-11-30T00:00:00Z,s2,B08,E:\ML_HUB\ref_south_africa_crops_competition_v...


In [5]:
competition_train_df.to_csv('competition_train.csv', index = False)
# competition_test_df.to_csv('competition_test.csv', index = False)

In [3]:
competition_train_df = pd.read_csv('competition_train.csv')

In [None]:
# import os
  
# shutdown = 'yes'
  
# if shutdown == 'no':
#     exit()
# else:
#     os.system("shutdown /s /t 1")

In [4]:
# This DataFrame lists all types of assets including documentation of the data. 
# In the following, we will use the Sentinel-2 bands as well as labels. 
competition_train_df['asset'].unique()

array(['documentation', 'field_ids', 'field_info_train', 'labels',
       'raster_values', 'B03', 'B04', 'B08', 'CLM'], dtype=object)

In [5]:
tile_ids_train = competition_train_df['tile_id'].unique()

In [11]:
time = competition_train_df['datetime'].unique()
time

array([nan, '2017-04-01T00:00:00Z', '2017-04-11T00:00:00Z',
       '2017-04-21T00:00:00Z', '2017-05-01T00:00:00Z',
       '2017-05-11T00:00:00Z', '2017-05-21T00:00:00Z',
       '2017-05-31T00:00:00Z', '2017-06-10T00:00:00Z',
       '2017-06-20T00:00:00Z', '2017-06-30T00:00:00Z',
       '2017-07-05T00:00:00Z', '2017-07-10T00:00:00Z',
       '2017-07-15T00:00:00Z', '2017-07-20T00:00:00Z',
       '2017-07-25T00:00:00Z', '2017-07-30T00:00:00Z',
       '2017-08-04T00:00:00Z', '2017-08-09T00:00:00Z',
       '2017-08-14T00:00:00Z', '2017-08-19T00:00:00Z',
       '2017-08-24T00:00:00Z', '2017-08-29T00:00:00Z',
       '2017-09-08T00:00:00Z', '2017-09-18T00:00:00Z',
       '2017-09-23T00:00:00Z', '2017-09-28T00:00:00Z',
       '2017-10-03T00:00:00Z', '2017-10-08T00:00:00Z',
       '2017-10-13T00:00:00Z', '2017-10-18T00:00:00Z',
       '2017-10-23T00:00:00Z', '2017-10-28T00:00:00Z',
       '2017-11-02T00:00:00Z', '2017-11-07T00:00:00Z',
       '2017-11-12T00:00:00Z', '2017-11-17T00:00:00Z',
     

In [7]:
# For simplicty of this baseline model, we will use only 5 images throughout the growing season
# You can choose to use all of them, select a few of them at specifc intervals, or 
# load as many as you want and interpolate between them to have a regular temporal frequency.

# Another assumption is that we are selecting the first 5 cloud free images. Ideally, you should
# select the images across the different tiles with the same temporal frequency. 
n_obs = 6

In [28]:
# X = np.empty((0, 3 * n_obs))
# y = np.empty((0, 1))
# field_ids = np.empty((0, 1))
# for tile_id in tile_ids_train:
#     tile_df = competition_train_df[competition_train_df['tile_id']==tile_id]

#     label_src = rasterio.open(tile_df[tile_df['asset']=='labels']['file_path'].values[0])
#     label_array = label_src.read(1)
#     y = np.append(y, label_array.flatten())

#     field_id_src = rasterio.open(tile_df[tile_df['asset']=='field_ids']['file_path'].values[0])
#     field_id_array = field_id_src.read(1)
#     field_ids = np.append(field_ids, field_id_array.flatten())

#     tile_date_times = tile_df[tile_df['datetime']==[date]]

#     X_tile = np.empty((256 * 256, 0))
#     n_X = 0

In [8]:
# Our goal is developing a pixel-based Random Forest model. So we will create an X variable
# that each row is a pixel and each column is one of the observations. 
# The other variables is y which has rows equal to the number of pixels. 
X = np.empty((0, 3 * n_obs))
y = np.empty((0, 1))
field_ids = np.empty((0, 1))

for tile_id in tile_ids_train:
    tile_df = competition_train_df[competition_train_df['tile_id']==tile_id]

    label_src = rasterio.open(tile_df[tile_df['asset']=='labels']['file_path'].values[0])
    label_array = label_src.read(1)
    y = np.append(y, label_array.flatten())

    field_id_src = rasterio.open(tile_df[tile_df['asset']=='field_ids']['file_path'].values[0])
    field_id_array = field_id_src.read(1)
    field_ids = np.append(field_ids, field_id_array.flatten())

    tile_date_times = tile_df[tile_df['satellite_platform']=='s2']['datetime'].unique()

    X_tile = np.empty((256 * 256, 0))
    n_X = 0
    for date_time in tile_date_times:
        # Here we retrieve the cloud band, and check if it's cloud free we will load the other bands
        # Otherwise we will pass on to the next observation
        
        clm_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='CLM')]['file_path'].values[0])
        clm_max = np.max(clm_src.read(1))
        # In the following we select images that the maximum cloud cover probability per pixel is 10% (10% * 255 = 25.5).
        if clm_max < 2.55:
            n_X+=1
            b3_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B03')]['file_path'].values[0])
            b3_array = np.expand_dims(b3_src.read(1).flatten(), axis=1)

            b4_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B04')]['file_path'].values[0])
            b4_array = np.expand_dims(b4_src.read(1).flatten(), axis=1)

            b8_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B08')]['file_path'].values[0])
            b8_array = np.expand_dims(b8_src.read(1).flatten(), axis=1)


            X_tile = np.append(X_tile, b3_array, axis = 1)
            X_tile = np.append(X_tile, b4_array, axis = 1)
            X_tile = np.append(X_tile, b8_array, axis = 1)
        if n_X == n_obs:
            break
        
    X = np.append(X, X_tile, axis=0)

MemoryError: Unable to allocate 20.6 GiB for an array with shape (115081216, 24) and data type float64

In [10]:
data = pd.DataFrame(X)
data['label'] = y.astype(int)
data['field_id'] = field_ids
data = data[data.label != 0] #this filters the pixels that don't have a label (or corresponding field ID)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,label,field_id
2048,29.0,40.0,57.0,29.0,41.0,60.0,24.0,33.0,52.0,27.0,37.0,55.0,20.0,23.0,50.0,2,3020.0
2304,30.0,41.0,57.0,30.0,39.0,58.0,25.0,34.0,51.0,29.0,39.0,57.0,20.0,25.0,53.0,2,3020.0
2560,30.0,43.0,58.0,30.0,43.0,61.0,25.0,36.0,55.0,30.0,41.0,57.0,23.0,26.0,53.0,2,3020.0
2561,32.0,43.0,63.0,30.0,44.0,64.0,24.0,33.0,52.0,30.0,41.0,59.0,20.0,24.0,51.0,2,3020.0
2816,28.0,42.0,58.0,32.0,42.0,60.0,25.0,34.0,53.0,28.0,40.0,58.0,21.0,24.0,52.0,2,3020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173670320,58.0,83.0,104.0,56.0,80.0,95.0,51.0,79.0,101.0,51.0,77.0,99.0,52.0,78.0,99.0,9,73261.0
173670321,61.0,87.0,110.0,57.0,80.0,98.0,55.0,80.0,102.0,53.0,78.0,99.0,57.0,82.0,107.0,9,73261.0
173670322,61.0,86.0,109.0,55.0,79.0,95.0,54.0,78.0,100.0,50.0,73.0,95.0,55.0,81.0,104.0,9,73261.0
173670323,57.0,82.0,103.0,52.0,73.0,92.0,51.0,73.0,95.0,49.0,71.0,93.0,52.0,77.0,99.0,9,73261.0


In [11]:
data.to_csv('X_competition_train4.csv', index = False)

In [12]:
competition_test_df.to_csv('competition_test4.csv', index = False)

## Competition Test Data
In this part we will load the competition test data (which does not have labels) and predict the crop class for each field

In [13]:
tile_ids_test = competition_test_df['tile_id'].unique()

In [14]:
X_competition_test = np.empty((0, 3 * n_obs))
field_ids_test = np.empty((0, 1))

for tile_id in tile_ids_test:
    tile_df = competition_test_df[competition_test_df['tile_id']==tile_id]
    
    field_id_src = rasterio.open(tile_df[tile_df['asset']=='field_ids']['file_path'].values[0])
    field_id_array = field_id_src.read(1)
    field_ids_test = np.append(field_ids_test, field_id_array.flatten())
    
    tile_date_times = tile_df[tile_df['satellite_platform']=='s2']['datetime'].unique()
    
    X_tile = np.empty((256 * 256, 0))
    n_X = 0
    for date_time in tile_date_times:
        # Here we retrieve the cloud band, and check if it's cloud free we will load the other bands
        # Otherwise we will pass on to the next observation
        
        clm_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='CLM')]['file_path'].values[0])
        clm_max = np.max(clm_src.read(1))
        
        if clm_max < 25:
            n_X+=1
            b3_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B03')]['file_path'].values[0])
            b3_array = np.expand_dims(b3_src.read(1).flatten(), axis=1)

            b4_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B04')]['file_path'].values[0])
            b4_array = np.expand_dims(b4_src.read(1).flatten(), axis=1)

            b8_src = rasterio.open(tile_df[(tile_df['datetime']==date_time) & (tile_df['asset']=='B08')]['file_path'].values[0])
            b8_array = np.expand_dims(b8_src.read(1).flatten(), axis=1)


            X_tile = np.append(X_tile, b3_array, axis = 1)
            X_tile = np.append(X_tile, b4_array, axis = 1)
            X_tile = np.append(X_tile, b8_array, axis = 1)
        if n_X == n_obs:
            break
        
    X_competition_test = np.append(X_competition_test, X_tile, axis=0)

In [15]:
data_test = pd.DataFrame(X_competition_test)
data_test['field_id'] = field_ids_test
data_test = data_test[data_test.field_id != 0]
data_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,field_id
72,44.0,62.0,84.0,45.0,61.0,82.0,47.0,61.0,85.0,47.0,65.0,86.0,42.0,60.0,81.0,102896.0
73,45.0,62.0,84.0,42.0,60.0,80.0,46.0,65.0,88.0,45.0,63.0,86.0,40.0,57.0,77.0,102896.0
74,41.0,60.0,80.0,42.0,58.0,77.0,44.0,62.0,84.0,44.0,60.0,82.0,38.0,53.0,76.0,102896.0
75,43.0,59.0,80.0,43.0,58.0,78.0,44.0,62.0,82.0,43.0,60.0,81.0,40.0,55.0,75.0,102896.0
76,44.0,61.0,80.0,44.0,60.0,79.0,44.0,61.0,84.0,45.0,62.0,83.0,42.0,58.0,77.0,102896.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74514427,34.0,51.0,70.0,34.0,50.0,73.0,34.0,52.0,74.0,30.0,46.0,72.0,31.0,46.0,75.0,61236.0
74514428,36.0,52.0,74.0,37.0,54.0,77.0,37.0,54.0,77.0,33.0,48.0,73.0,32.0,47.0,74.0,61236.0
74514429,38.0,54.0,73.0,36.0,52.0,76.0,36.0,53.0,78.0,33.0,50.0,72.0,32.0,48.0,74.0,61236.0
74514430,38.0,56.0,76.0,37.0,55.0,77.0,38.0,57.0,77.0,35.0,52.0,74.0,35.0,50.0,75.0,61236.0


In [16]:
data_test.to_csv('X_competition_test4.csv', index = False)

In [4]:
data_test = pd.read_csv('X_competition_test4.csv')

1) https://zindi.africa/competitions/radiant-earth-spot-the-crop-challenge/discussions/7282
2) https://zindi.africa/hackathons/radiant-earth-spot-the-crop-hackathon/discussions/6728
3) https://zindi.africa/competitions/radiant-earth-spot-the-crop-challenge/discussions/7019

In [5]:
data_test_grouped = data_test.groupby('field_id').mean().reset_index()
data_test_grouped

Unnamed: 0,field_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,5.0,37.520223,49.124128,61.799163,37.486750,49.991632,64.218968,33.453278,43.214784,54.471409,27.702929,35.867503,43.847978,19.259414,25.662483,41.065551
1,10.0,21.932465,34.040039,53.137000,21.264351,33.236855,53.280753,20.407622,27.798842,48.596237,17.765557,21.308249,57.735649,18.106609,12.701399,80.054510
2,11.0,43.307474,58.715232,78.382214,39.543992,53.197729,70.063387,45.402081,61.543046,82.907285,44.615894,61.055818,82.386944,44.166509,61.079470,81.649007
3,17.0,41.615789,64.560526,83.273684,42.003947,64.925000,83.109211,38.659211,63.111842,82.563158,42.553947,67.951316,86.764474,41.134211,66.263158,84.692105
4,18.0,22.336842,39.115789,51.084211,20.210526,37.694737,52.736842,19.389474,36.168421,50.284211,19.905263,37.242105,49.715789,18.642105,36.800000,51.757895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35290,122722.0,27.127013,55.822338,73.859740,24.064675,52.218961,69.664416,27.256883,57.305455,76.188312,25.626753,54.829610,72.398442,22.810909,50.195065,65.273506
35291,122724.0,30.241144,41.461853,71.245232,30.137602,41.433243,69.673025,25.702997,34.944142,55.587193,25.925068,34.291553,53.644414,16.153951,20.126703,42.302452
35292,122726.0,34.381078,52.388962,67.998686,35.228647,54.307490,72.021025,34.345598,53.486202,70.006570,26.386334,43.634691,59.350854,26.759527,43.336399,58.001314
35293,122730.0,26.715481,37.815900,57.092050,26.895397,38.217573,58.719665,27.058577,37.100418,56.631799,25.937238,35.652720,54.288703,26.531381,35.748954,52.539749


https://zindi.africa/competitions/radiant-earth-spot-the-crop-challenge/discussions/7306

In [6]:
data_test_grouped.to_csv('data_test4.csv', index = False)