## Use RAMP detected damage to balance tiles

In [1]:
import geopandas as gpd
import pandas as pd
import pathlib
import pystac

In [2]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/Users/tud500158/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - TUD500158/github/AutomatedDamageDetection/notebooks/preprocessing'

In [3]:
# Root folder that holds the project data. It defaults to the original author's
# local folder; set the ADD_DATA_HOME environment variable to point the notebook
# at another location without editing this cell.
homedir = os.environ.get(
    'ADD_DATA_HOME',
    '/Users/tud500158/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - TUD500158/',
)
# Folder that is expected to contain the STAC tile catalog.
catalog_path = os.path.join(homedir,'Data/tiles')

In [4]:

def _read_tile_catalog(catalog_path):
    """Load the STAC catalog stored as ``catalog.json`` inside ``catalog_path``.

    :param catalog_path: directory (str or path-like) that contains ``catalog.json``
    :return: the object returned by ``pystac.Catalog.from_file``
    """
    catalog_file = pathlib.Path(catalog_path).joinpath("catalog.json")
    # pystac is handed a POSIX-style string rather than a Path object.
    return pystac.Catalog.from_file(catalog_file.as_posix())


def _catalog_to_geodataframe(catalog, crs="WGS84"):
    """Build a GeoDataFrame with one row per item of a STAC catalog.

    :param catalog: catalog object exposing ``get_all_items()``
    :param crs: coordinate reference system assigned to the result
    :return: GeoDataFrame indexed by item id; every column whose name
        contains 'datetime' is converted with ``pd.to_datetime``
    """
    # Collect the items keyed by id; the keys become the index below.
    items_by_id = {}
    for item in catalog.get_all_items():
        items_by_id[item.id] = item.to_dict()

    frame = gpd.GeoDataFrame.from_features(items_by_id.values())
    frame.index = items_by_id.keys()

    datetime_columns = [name for name in frame.columns if 'datetime' in name]
    for name in datetime_columns:
        frame[name] = pd.to_datetime(frame[name])

    return frame.set_crs(crs)


In [5]:
# Tiles whose damage-pixel count is at or above this quantile are selected.
select_dmg_quantile = 0.9
dmg_px_count_file = os.path.join(homedir,'Data/RAMP/RAMP_tiled/RAMP_tiled_dmg_px_count.csv')

# Read the tile catalog; the index of `tiles` holds the tile names (excluding .tif).
catalog = _read_tile_catalog(catalog_path)
tiles = _catalog_to_geodataframe(catalog)
tilelist = tiles.index.values.tolist()

# Defaults, so both names exist even when no damage file is configured.
train_set = gpd.GeoDataFrame()
tile_nums_heavydmg = []
if dmg_px_count_file is not None:  # dmg file: "RAMP_tiled_dmg_px_count" .pkl or .csv
    # Read the damage-indicator table (one row per tile).
    tiled_dmg_px_count = pd.read_csv(dmg_px_count_file, index_col=0)

    # Select the tiles with the most damage.
    dmg_threshold = tiled_dmg_px_count['Dmg_px_count'].quantile(select_dmg_quantile)
    df_mask = tiled_dmg_px_count['Dmg_px_count'] >= dmg_threshold
    tiles_heavydmg = tiled_dmg_px_count[df_mask]
    tile_nums_heavydmg = tiles_heavydmg['Tile_Num'].values  # tile numbers of the selected tiles

    # Match on the full 'tile_<N>' suffix so that e.g. 10 is not confused
    # with 100 or 110.
    heavydmg_files = [
        file
        for tile_N in tile_nums_heavydmg
        for file in tilelist
        if file.endswith('tile_' + str(tile_N))
    ]

    # Select all rows in one lookup instead of concatenating inside a loop.
    train_set = tiles.loc[heavydmg_files]

print(len(train_set))
# Keep this result under its own names for comparison with the ROI-excluded run.
train_set_1 = train_set
dmg_tiles_1 = tile_nums_heavydmg

32


## Exclude the test region (ROI) from the whole training workflow; set it apart for testing at the final stage

In [6]:
# Tile numbers that make up the held-out test region (ROI).
# An earlier, smaller selection did not include tiles 123, 124 and 125.
tileNums_test_ROI = [
    112,
    122, 123, 124, 125, 126,
    139, 140, 141, 142, 143,
    151, 152, 153, 154,
]


#### Remove ROI tiles from the tile list that is used to split training/val/test sets

In [7]:
# Collect the tile names whose trailing number belongs to the test ROI.
tilelist_ROI = []
for tile_name in tilelist:
    tile_number = int(tile_name.rsplit('_', 1)[-1])
    if tile_number in tileNums_test_ROI:
        tilelist_ROI.append(tile_name)

# Rows of the ROI tiles, set apart as their own frame.
ROI_set = tiles.loc[tilelist_ROI]

# Every remaining tile: the index minus the ROI names, looked up in `tiles`.
tiles_set = tiles.loc[tiles.index.difference(tilelist_ROI)]
tiles_set

Unnamed: 0,geometry,start_datetime,end_datetime,datetime
S2_composite_2019-11-1_2020-3-1_tile_0,"POLYGON ((-63.43559 -64.67813, -61.18918 -64.1...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_1,"POLYGON ((-61.18988 -64.16869, -59.03628 -63.6...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_10,"POLYGON ((-60.94604 -66.62668, -58.57037 -66.0...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_100,"POLYGON ((-53.13093 -78.53210, -48.81395 -77.8...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_101,"POLYGON ((-48.81493 -77.81454, -45.00000 -77.0...",2019-11-01,2020-03-01,NaT
...,...,...,...,...
S2_composite_2019-11-1_2020-3-1_tile_95,"POLYGON ((-82.87520 -80.74381, -75.96362 -80.5...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_96,"POLYGON ((-75.96416 -80.53354, -69.44402 -80.1...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_97,"POLYGON ((-69.44452 -80.19327, -63.43472 -79.7...",2019-11-01,2020-03-01,NaT
S2_composite_2019-11-1_2020-3-1_tile_98,"POLYGON ((-63.43564 -79.73618, -57.99457 -79.1...",2019-11-01,2020-03-01,NaT


#### Remove ROI tiles from the damage file that is used for balancing

In [8]:

tileNums_test_ROI = [112,122,123,124,125,126,139,140,141,142,143,151,152,153,154]

# Boolean mask over the damage table: True where the tile is NOT part of the
# test ROI (row is kept), False where it is (row is dropped). Despite its name,
# `ROI_tiles` therefore marks the tiles to keep.
ROI_tiles = ~tiled_dmg_px_count['Tile_Num'].isin(tileNums_test_ROI)
new_df = tiled_dmg_px_count[ROI_tiles]
new_df

Unnamed: 0,Tile_Num,Dmg_px_count
RAMP_mosaic_100m_tile_0.tif,0,390
RAMP_mosaic_100m_tile_100.tif,100,29
RAMP_mosaic_100m_tile_101.tif,101,182
RAMP_mosaic_100m_tile_102.tif,102,1532
RAMP_mosaic_100m_tile_103.tif,103,731
...,...,...
RAMP_mosaic_100m_tile_96.tif,96,100
RAMP_mosaic_100m_tile_97.tif,97,83
RAMP_mosaic_100m_tile_98.tif,98,104
RAMP_mosaic_100m_tile_99.tif,99,145


### Apply in workflow for tiles.py

In [9]:


# Tiles whose damage-pixel count is at or above this quantile are selected.
select_dmg_quantile = 0.9
dmg_px_count_file = os.path.join(homedir,'Data/RAMP/RAMP_tiled/RAMP_tiled_dmg_px_count.csv')

# Read the tile catalog; the index of `tiles` holds the tile names (excluding .tif).
catalog = _read_tile_catalog(catalog_path)
tiles = _catalog_to_geodataframe(catalog)
tilelist = tiles.index.values.tolist()    # all tile names, test ROI still included

# Split off the test-ROI tiles before dividing into training/test/val.
tileNums_test_ROI = [112,122,123,124,125,126,139,140,141,142,143,151,152,153,154]
tilelist_ROI = [item for item in tilelist if int(item.split('_')[-1]) in tileNums_test_ROI]
ROI_set = tiles.loc[tilelist_ROI]
# From here on `tiles` only contains the non-ROI tiles.
tiles_set = tiles.loc[tiles.index.difference(tilelist_ROI)]
tiles = tiles_set

# Defaults, so both names exist even when no damage file is configured.
train_set = gpd.GeoDataFrame()
tile_nums_heavydmg = []
if dmg_px_count_file is not None:  # dmg file: "RAMP_tiled_dmg_px_count" .pkl or .csv
    # Read the damage-indicator table (one row per tile).
    tiled_dmg_px_count = pd.read_csv(dmg_px_count_file, index_col=0)
    print(tiled_dmg_px_count)

    # Drop the test-ROI tiles from the damage table as well, so the quantile
    # is computed over the same tiles that remain available for training.
    # `ROI_tiles` is True for rows that are kept (tile NOT in the ROI).
    ROI_tiles = ~tiled_dmg_px_count['Tile_Num'].isin(tileNums_test_ROI)
    tiled_dmg_px_count = tiled_dmg_px_count[ROI_tiles]

    # Select the tiles with the most damage.
    dmg_threshold = tiled_dmg_px_count['Dmg_px_count'].quantile(select_dmg_quantile)
    df_mask = tiled_dmg_px_count['Dmg_px_count'] >= dmg_threshold
    tiles_heavydmg = tiled_dmg_px_count[df_mask]
    tile_nums_heavydmg = tiles_heavydmg['Tile_Num'].values  # tile numbers of the selected tiles

    # Match against the ROI-excluded tile names (not the stale `tilelist`), so
    # every matched name is guaranteed to be a row of `tiles`. The full
    # 'tile_<N>' suffix is used so that e.g. 10 is not confused with 100 or 110.
    train_candidates = tiles.index.values.tolist()
    heavydmg_files = [
        file
        for tile_N in tile_nums_heavydmg
        for file in train_candidates
        if file.endswith('tile_' + str(tile_N))
    ]

    # Select all rows in one lookup instead of concatenating inside a loop.
    train_set = tiles.loc[heavydmg_files]

# Keep this result under its own names for comparison with the run that
# did not exclude the ROI.
train_set_2 = train_set
dmg_tiles_2 = tile_nums_heavydmg
print(len(train_set_2))

                               Tile_Num  Dmg_px_count
RAMP_mosaic_100m_tile_0.tif           0           390
RAMP_mosaic_100m_tile_100.tif       100            29
RAMP_mosaic_100m_tile_101.tif       101           182
RAMP_mosaic_100m_tile_102.tif       102          1532
RAMP_mosaic_100m_tile_103.tif       103           731
...                                 ...           ...
RAMP_mosaic_100m_tile_96.tif         96           100
RAMP_mosaic_100m_tile_97.tif         97            83
RAMP_mosaic_100m_tile_98.tif         98           104
RAMP_mosaic_100m_tile_99.tif         99           145
RAMP_mosaic_100m_tile_9.tif           9           339

[313 rows x 2 columns]
30


In [10]:
# Display the heavy-damage tile numbers stored in dmg_tiles_2.
dmg_tiles_2

array([102, 110, 114, 172, 182, 204, 205, 206, 209, 214, 228, 238, 247,
       250, 261, 268, 273, 282, 285,  28, 291, 292, 299, 301, 302, 307,
        50,  53,  68,  93])

In [11]:
# Set difference "in A but not in B": what was selected without the ROI
# exclusion (…_1) but is no longer selected with it (…_2).
# sorted() gives a stable, readable order; iterating a set of strings can
# differ between kernel sessions.
removed_tile_nums = sorted(set(dmg_tiles_1) - set(dmg_tiles_2))
print('Tiles removed from balanced training-set as they were in testROI: \n{}'.format(removed_tile_nums))

# Same comparison on the tile names (index of the two training sets).
removed_tile_names = sorted(set(train_set_1.index.values) - set(train_set_2.index.values))
print('Tiles removed from balanced training-set as they were in testROI: \n{}'.format(removed_tile_names))

# Original variable name, kept for any later cell that reads it.
difference = removed_tile_names


Tiles removed from balanced training-set as they were in testROI: 
[139, 140, 142, 123, 124]
Tiles removed from balanced training-set as they were in testROI: 
['S2_composite_2019-11-1_2020-3-1_tile_123', 'S2_composite_2019-11-1_2020-3-1_tile_142', 'S2_composite_2019-11-1_2020-3-1_tile_140', 'S2_composite_2019-11-1_2020-3-1_tile_139', 'S2_composite_2019-11-1_2020-3-1_tile_124']


In [12]:
# Display the tile numbers of the held-out test ROI.
tileNums_test_ROI

[112, 122, 123, 124, 125, 126, 139, 140, 141, 142, 143, 151, 152, 153, 154]