In [2]:
%load_ext autoreload
%autoreload 2

In [62]:
# Built-in modules
import os
import gzip
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# Basics of Python data handling and visualization
import tqdm
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt

# Imports from eo-learn and sentinelhub-py
from eolearn.core import EOPatch, EOTask, LinearWorkflow, FeatureType

# Visualisation utils
from air_quality_and_health_challenge.utils import (get_extent, 
                   draw_outline, 
                   draw_bbox, 
                   draw_feature, 
                   draw_true_color,
                   unzip_file,
                   load_tiffs,
                   days_to_datetimes,
                   datetimes_to_days,
                   reproject_tiff,
                   upscale_tiff,
                   mask_tiff)

from lib.utils import _get_point
from configs.utils import load_config
from lib.train_utils import get_data, load_pickle, save_pickle
from algorithms.ensemble import Ensemble

In [63]:
TRAIN_DIR = Path("./data/train")

aoi = AOI = "Italy"
LABEL = "NO2"

WDIR = Path("./wdir") / AOI
os.makedirs(WDIR, exist_ok=True)

In [103]:
from configs.utils import load_config
configs = {}
configs["pm25_italy"] = load_config("configs/pm25_italy.yaml")
configs["pm25_california"] = load_config("configs/pm25_california.yaml")
configs["pm25_southafrica"] = load_config("configs/pm25_southafrica.yaml")
configs["no2_italy"] = load_config("configs/no2_italy.yaml")
configs["no2_california"] = load_config("configs/no2_california.yaml")
configs["no2_southafrica"] = load_config("configs/no2_southafrica.yaml")

config = configs[("no2" if LABEL == "NO2" else "pm25") + "_" + AOI.lower().replace("_", "")]
config

{'cache_path': 'wdir/no2_italy/cache.pkl',
 'weights_path': 'wdir/no2_italy/weights.pkl',
 'aoi': 'Italy',
 'label': 'NO2',
 'indim': 8,
 'outdim': 1,
 'feature_keys': ['no2_native',
  'uv_native',
  'no2_target',
  'uv_target',
  'modis_native',
  'elevation_native',
  'modis_target',
  'elevation_target'],
 'models': {'model1': {'sizes': [32, 32],
   'lr': 1e-05,
   'epochs': 1024,
   'batch_size': 32,
   'dropout': 0.15,
   'use_swish': False,
   'use_batch_norm': False,
   'patience': 30}}}

In [65]:
#### GT
dir = TRAIN_DIR/AOI/"ground_air_quality"/("NO2" if LABEL == "NO2" else "PM25")
gt_path = dir / (os.listdir(dir)[0][:-3] + 'shp')
gt_df = gpd.read_file(gt_path)
gt_df.head()

Unnamed: 0,Date,SITE_LATIT,SITE_LONGI,AirQuality,geometry
0,2336,45.5829,8.83449,56.2,POINT (8.83449 45.58290)
1,2314,45.5829,8.83449,37.7,POINT (8.83449 45.58290)
2,2329,45.5829,8.83449,43.2,POINT (8.83449 45.58290)
3,2294,45.5829,8.83449,23.2,POINT (8.83449 45.58290)
4,2295,45.5829,8.83449,31.3,POINT (8.83449 45.58290)


In [66]:
outdir = WDIR
indir  = TRAIN_DIR / AOI
data, gts, target_size = get_data(indir, outdir, LABEL, aoi, config["feature_keys"])

  and should_run_async(code)
100%|██████████| 87/87 [00:00<00:00, 571.28it/s]


In [67]:
# Connect the day_data to GT
dataset = { "X": [],
            "Y": [],
            "gt": [],
            "native_Y": [],
            "target_Y": [],
            "lat": [], 
            "lon": [],
            "coords": [],
            "date": []}


in_eop = data['data'][0]['s5p' if LABEL == "NO2" else 'cams']
grid = np.zeros(target_size)
for date, lat, lon, gt in tqdm.tqdm(gts[["Date", "SITE_LATIT", "SITE_LONGI", "AirQuality"]].values):
    y_ind, x_ind = _get_point(lat, lon, grid, in_eop.bbox)
    #print(y_ind, x_ind, lat, lon, grid.shape, in_eop.bbox)
    
    try:
        ind = data["date"].index(int(date))
    except:
        ind = None
        #print("Error on date:", date)
        
    if ind:
        native = data['feat_dicts'][ind]['no2_native' if LABEL == 'NO2' else 'pm25_native']
        target = data['feat_dicts'][ind]['no2_target' if LABEL == 'NO2' else 'pm25_target']

        #print(ind, len)
        if np.isnan(native[y_ind, x_ind]):
            continue

        dataset["X"].append(data['feats'][ind][y_ind, x_ind])
        dataset["native_Y"].append(native[y_ind, x_ind])
        dataset["target_Y"].append(target[y_ind, x_ind])
        dataset["gt"].append(gt)
        dataset["lat"].append(lat)
        dataset["lon"].append(lon)
        dataset["coords"].append((lat, lon))
        dataset["date"].append(date)

        
dataset = {k: np.array(v) for k, v in dataset.items()}

100%|██████████| 7636/7636 [00:00<00:00, 24646.92it/s]


In [68]:
gt, native_Y = dataset["gt"], dataset["native_Y"]
# Filter any very off GT

if LABEL == "NO2":
    dataset["Y"] = gt / (6.02214 * 1e4 * 1.9125) - native_Y
    v1 = gt
    v2 = native_Y * 6.02214 * 1e4 * 1.9125
    m = np.min([v2, v1], axis=0)
    r = (v1 / v2)
    if "Africa" in aoi:
        mask =  (r < 3.5) & (r > 0.7)
    elif "California" == aoi:
        mask =  (r < 4.5) & (r > 0.7)
    elif "Italy" == aoi:
        mask =  (r < 2.5) & (r > 0.85)
    
else:
    dataset["Y"] = gt - native_Y
    v1 = gt
    v2 = native_Y
    if "Africa" in aoi:
        m = np.min([v2, v1], axis=0)
        mask = ~((abs(v2 - v1) > (m * 3)) & ((v2 > 1) | (v1 > 1)))
    elif aoi == "Italy":
        mask = ~((abs(v2 - v1) > v2 * 0.9) & ((v2 > 5) | (v1 > 5)))
    elif aoi == "California":
        mask = ~((abs(v2 - v1) > v2 * 0.535) & ((v2 > 5) | (v1 > 5)))

print(mask.mean())
print(np.corrcoef(gt[mask], native_Y[mask]))

0.38431771894093686
[[1.         0.77217065]
 [0.77217065 1.        ]]


In [69]:
X = dataset["X"][mask]
Y = dataset["Y"][mask]
coords = dataset["coords"][mask]
X.shape

(1887, 8)

In [70]:
mu = np.nanmean(data["feats"], axis=(0, 1, 2))
sigma = np.nanstd(data["feats"], axis=(0, 1, 2))

In [104]:
from algorithms.ensemble import Ensemble
ensemble = Ensemble(config, indir, outdir)

In [105]:
ensemble.train(X, Y, coords=coords, mu=mu, sigma=sigma)

train_X.shape: (1619, 8)
test_X.shape: (268, 8)

Epoch.0
loss : 0.24189029049639607
eval_score : -0.09523940831422806

Epoch.1
loss : 0.272285814232686
eval_score : -0.08975501358509064

Epoch.2
loss : 0.23367147060001597
eval_score : -0.0848962813615799

Epoch.3
loss : 0.22404139985640845
eval_score : -0.08076785504817963

Epoch.4
loss : 0.25175044758647097
eval_score : -0.07646867632865906

Epoch.5
loss : 0.22949729538431354
eval_score : -0.07286487519741058

Epoch.6
loss : 0.21214444292526619
eval_score : -0.06966985017061234

Epoch.7
loss : 0.19665317398075963
eval_score : -0.06650788336992264

Epoch.8
loss : 0.2262857867514386
eval_score : -0.06288310140371323

Epoch.9
loss : 0.21533176638916426
eval_score : -0.05954785645008087

Epoch.10
loss : 0.2178912763209904
eval_score : -0.05637573078274727

Epoch.11
loss : 0.19086863042092791
eval_score : -0.053704533725976944

Epoch.12
loss : 0.18123445820574666
eval_score : -0.05112972483038902

Epoch.13
loss : 0.18657679169201383
eval_s

In [99]:
X

array([[ 8.4739266e-05, -7.2890210e-01,  9.0698712e-05, ...,
         2.1840312e+02,  9.3637079e-02,  2.1445175e+02],
       [ 1.6733103e-04, -1.6459069e-01,  1.8723046e-04, ...,
         2.1840312e+02,  9.3637079e-02,  2.1445175e+02],
       [ 2.3314869e-04, -2.4473157e+00,  2.2425773e-04, ...,
         2.1840312e+02,  9.3637079e-02,  2.1445175e+02],
       ...,
       [ 3.3292937e-04, -3.2222047e-01,  4.0296328e-04, ...,
         4.4866581e+01,            nan,  5.1388309e+01],
       [ 3.4398041e-04, -6.5851086e-01,  3.3271342e-04, ...,
         4.0567768e+01,  1.1277722e-01,  3.7246365e+01],
       [ 2.0688381e-04, -2.5254294e-01,  1.9012607e-04, ...,
         4.0567768e+01,            nan,  3.7246365e+01]], dtype=float32)

In [100]:
ensemble.models

{'model2': <algorithms.mlp.MLP at 0x7f9833068bb0>}

In [101]:
ensemble.models['model2'].predict(X[:50])

array([ 2.54528481e-04,  2.54528481e-04,  8.63688765e-05,  2.74433492e-04,
        2.54528481e-04,  2.44645809e-04,  2.44922703e-04, -3.84171843e-04,
        3.63522937e-04,  2.54528481e-04,  2.50500540e-04,  2.54528481e-04,
        1.21522724e-04,  5.40584209e-04,  3.94080475e-04,  1.81082578e-04,
        2.41228874e-04,  2.54528481e-04,  2.54528481e-04,  3.64534673e-04,
        7.66860321e-05,  2.15260545e-04,  2.54552171e-04,  2.54528481e-04,
        2.54528481e-04,  2.54528481e-04,  6.55508920e-05,  2.45064177e-04,
        2.49906705e-04,  1.48792271e-04,  2.13358086e-04,  2.18410962e-04,
        3.92034155e-04,  2.20137576e-04,  4.83961136e-04,  2.28665071e-04,
        1.99967209e-04,  2.54528481e-04,  2.54528481e-04,  1.88674021e-04,
       -1.30394741e-03,  4.52249107e-04,  2.54528481e-04,  2.54528481e-04,
        2.54528481e-04,  2.54528481e-04,  8.10538186e-05,  2.54528481e-04,
        2.53841077e-04,  2.54528481e-04], dtype=float32)

In [102]:
Y[:50]

array([ 1.16696087e-04,  1.04433043e-04,  7.94234088e-05,  1.63076675e-04,
        1.57088255e-04,  4.99271896e-05,  1.90827375e-04,  7.29686702e-06,
        1.12587748e-04,  1.85497064e-04,  4.80882228e-05,  1.49299833e-04,
        7.83762938e-05,  8.81026597e-05,  2.37251305e-04,  3.67028684e-04,
        9.20865789e-05,  1.52651890e-04,  2.11921181e-04,  2.87094666e-04,
        5.80049069e-05, -6.15895870e-06,  8.44926769e-05,  1.55484014e-04,
        1.02935730e-04,  1.70877790e-04,  8.37842641e-05,  1.02263722e-04,
        1.42195349e-04, -3.81691899e-05,  4.41629409e-05,  3.31347576e-05,
        7.25770762e-05,  1.40143156e-04,  8.87593428e-05,  7.37996927e-05,
        5.02876572e-05,  1.77224876e-04,  1.04123999e-04,  9.12677057e-05,
       -1.11076622e-05,  2.25789545e-04,  8.28666573e-05,  1.17313399e-04,
        2.20033097e-04,  1.78899059e-04,  1.04950109e-04,  1.03410861e-04,
        1.52872117e-04,  1.45394696e-04])

In [None]:
ensemble.save()