From afe8b9b2c11f10d6ca123b98dca0d64139e8aaba Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 1 Mar 2022 10:09:30 +1100 Subject: [PATCH 01/33] write cleaned and dropped when we are actually dropping --- scripts/intersect_rasters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index f1fa77f0..917599bc 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -185,11 +185,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) else: + print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") # rets = Parallel( From ba59a2712d11c1f9dc971d598d95bade6cded34c Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 2 Mar 2022 08:27:30 +1100 Subject: [PATCH 02/33] drop duplicate values during intersect --- scripts/intersect_rasters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 917599bc..31e7fafe 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -125,10 +125,10 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() # import IPython; IPython.embed(); import sys; sys.exit() - pts_deduped = pts_mean.merge( - pts_count, how='inner', on=['rows', 'cols']).merge( - pts[['rows', 'cols', 'geometry']], how='inner', on=['rows', 'cols'] - ) + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: From 49623bd213b41b492abadee1ba4e905ffced2871 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 8 Mar 2022 17:24:09 +1100 Subject: [PATCH 03/33] geom cols are inserted even without dedupe --- scripts/intersect_rasters.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 31e7fafe..d983228d 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -30,7 +30,7 @@ def generate_key_val(gl, shorts): shorts[eight_char_name] += 1 else: shorts[eight_char_name] += 1 - return Path(gl).name, eight_char_name + str(shorts[eight_char_name]) + return gl, eight_char_name + str(shorts[eight_char_name]) # import IPython; IPython.embed(); import sys; sys.exit() @@ -92,11 +92,11 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], 
dedupe: bool = print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) tif_name = list(geotifs.keys())[0] - tif = data_location.joinpath(tif_name) + tif = Path(tif_name) orig_cols = pts.columns with rasterio.open(tif) as src: # resample data to target shape @@ -135,12 +135,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_deduped = pts coords_deduped = coords - for k, v in geotifs.items(): - print(f"adding {k} to output dataframe") - with rasterio.open(data_location.joinpath(k)) as src: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + with rasterio.open(Path(k)) as src: pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] - pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) output_dir = Path('out_resampled') output_dir.mkdir(exist_ok=True, parents=True) out_shp = output_dir.joinpath(shp.name) @@ -152,7 +152,6 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = if __name__ == '__main__': # local - data_location = Path("configs/data/sirsam") # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') shapefile_location = Path("configs/data") shp = shapefile_location.joinpath('geochem_sites.shp') @@ -171,15 +170,19 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = k, v = generate_key_val(gl, shorts) geotifs[k] = v - for k, v in geotifs.items(): - print(f"\"{k}\": \"{v}\"") - # check all required files are available on disc for k, v in geotifs.items(): print(f"checking if {k} exists") - assert data_location.joinpath(k).exists() + assert Path(k).exists() + + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=True) + out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=False) df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) df3 = df2[list(geotifs.values())] df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] From 70a03b0c7dbb0245a1c6b5c5e857117f4ac88fda Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 10 Mar 2022 16:31:12 +1100 Subject: [PATCH 04/33] write another wrapper function including clean --- scripts/intersect_rasters.py | 45 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index d983228d..13364d5d 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -150,11 +150,29 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = return out_shp +def intersect_sample_and_clean(shp, dedupe: bool = False): + out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) + df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = 
df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + if __name__ == '__main__': # local # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') shapefile_location = Path("configs/data") - shp = shapefile_location.joinpath('geochem_sites.shp') + # shp = shapefile_location.joinpath('geochem_sites.shp') downscale_factor = 2 # keep 1 point in a 2x2 cell @@ -181,22 +199,9 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = print(f"\"{Path(k).name}\": \"{v}\"") print('='*100) - - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=False) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) - df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - - if df5.shape[0]: - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) - else: - print(f"No points dropped and there for _cleaned.shp file is not createed'") - print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - -# rets = Parallel( - # n_jobs=-1, - # verbose=100, - # )(delayed(intersect_and_sample_shp)(s) for s in shapefile_location.glob("*.shp")) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) From 0f5a456cbcfd7661374b62109050fddf651c9878 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 10 Mar 2022 17:11:03 +1100 Subject: [PATCH 05/33] change to single shapefile multiple raster intersect --- scripts/intersect_rasters.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 13364d5d..d421c880 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -135,10 +135,16 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_deduped = pts coords_deduped = coords - for i, (k, v) in enumerate(geotifs.items()): - print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") - with rasterio.open(Path(k)) as src: - pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) output_dir = 
Path('out_resampled') @@ -150,6 +156,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = return out_shp +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + def intersect_sample_and_clean(shp, dedupe: bool = False): out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) From e828de1620d833821bcb91b77c75a43f10275819 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 19 Mar 2022 14:25:22 +1100 Subject: [PATCH 06/33] default to only used shapes --- scripts/intersect_rasters.py | 48 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index d421c880..0aef9989 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -88,7 +88,7 @@ def generate_key_val(gl, shorts): # } -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False): +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) coords = np.array([(p.x, p.y) for p in pts.geometry]) @@ -138,22 +138,22 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # for k, v in geotifs.items(): # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") # pts_deduped[v] = __extract_raster_points(coords_deduped, k) - pts_deduped__ = Parallel( - n_jobs=-1, - verbose=100, - )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) - - for i, col_name in enumerate(geotifs.values()): - pts_deduped[col_name] = pts_deduped__[i] + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) - output_dir = Path('out_resampled') - output_dir.mkdir(exist_ok=True, parents=True) - out_shp = output_dir.joinpath(shp.name) - pts_deduped.to_file(out_shp.as_posix()) - print(f"saved intersected shapefile at {out_shp.as_posix()}") # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) - return out_shp + return pts_deduped def __extract_raster_points(raster, coords_deduped): @@ -162,9 +162,12 @@ def __extract_raster_points(raster, coords_deduped): return [x[0] for x in src.sample(coords_deduped)] -def intersect_sample_and_clean(shp, dedupe: bool = False): - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) df3 = df2[list(geotifs.values())] df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & 
((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] @@ -172,8 +175,11 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): if df5.shape[0]: df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") except: @@ -182,10 +188,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): if __name__ == '__main__': # local - # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') + output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - # shp = shapefile_location.joinpath('geochem_sites.shp') - downscale_factor = 2 # keep 1 point in a 2x2 cell geom_cols = ['POINT_X', 'POINT_Y'] @@ -212,7 +216,7 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): print('='*100) for s in shapefile_location.glob("*.shp"): - intersect_sample_and_clean(s, True) + intersect_sample_and_clean(s, True, True) # rets = Parallel( # n_jobs=-1, # verbose=100, From b232364601525ac28837721b1a07ae4a739dccde Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 1 Apr 2022 13:48:10 +1100 Subject: [PATCH 07/33] adding median based filtering of majors dataset --- scripts/dedupe_with_median_scratch.py | 291 ++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 scripts/dedupe_with_median_scratch.py diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py new file mode 100644 index 00000000..d24be9cc --- /dev/null +++ b/scripts/dedupe_with_median_scratch.py @@ -0,0 +1,291 @@ +from typing import Dict +import csv +from collections import defaultdict +from os import path +from pathlib import Path +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd +from joblib import Parallel, delayed + + +def read_list_file(list_path: str): + files = [] + csvfile = path.abspath(list_path) + with open(csvfile, 'r') as f: + reader = csv.reader(f) + tifs = list(reader) + tifs = [f[0].strip() for f in tifs + if (len(f) > 0 and f[0].strip() and + f[0].strip()[0] != '#')] + for f in tifs: + files.append(path.abspath(f)) + return files + + +def generate_key_val(gl, shorts): + eight_char_name = Path(gl).name[:8] + if eight_char_name not in shorts: + shorts[eight_char_name] += 1 + else: + shorts[eight_char_name] += 1 + return gl, eight_char_name + str(shorts[eight_char_name]) + + +# import IPython; IPython.embed(); import sys; sys.exit() + + +# data_location = \ +# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") +# Read points from shapefile + +# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") + +# geotifs = { +# "relief_radius4.tif": "relief4", +# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", +# "MvrtpLL_smooth.tif": "mrvtpll_s", +# "MvrtpLL_fin.tif": "mvrtpll_f", +# "mrvbf_9.tif": "mrvbf_9", +# "Rad2016K_Th.tif": "rad2016kth", +# "Thorium_2016.tif": "th_2016", +# "Mesozoic_older_raster_MEAN.tif": "meso_mean", +# 
"LOC_distance_to_coast.tif": "loc_dis", +# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", +# "water-85m_1.tif": "water_85m", +# "clim_RSM_albers.tif": "clim_rsm", +# "tpi_300.tif": "tpi_300", +# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", +# "si_geol1.tif": "si_geol1", +# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", +# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", +# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", +# "Gravity_land.tif": "gravity", +# "dem_fill.tif": "dem", +# "Clim_Prescott_LindaGregory.tif": "clim_linda", +# "slope_fill2.tif": "slopefill2", +# "clim_PTA_albers.tif": "clim_alber", +# "SagaWET9cell_M.tif": "sagawet", +# "ceno_euc_aust1.tif": "ceno_euc", +# "s2-dpca-85m_band1.tif": "s2_band1", +# "s2-dpca-85m_band2.tif": "s2_band2", +# "s2-dpca-85m_band3.tif": "s2_band3", +# "3dem_mag0_finn.tif": "3dem_mag0", +# "3dem_mag1_fin.tif": "3dem_mag1", +# "3dem_mag2.tif": "3dem_mag2", +# } + +# geotifs = { +# "relief_apsect.tif": "relief4", +# "LATITUDE_GRID1.tif": "latitude", +# "LONGITUDE_GRID1.tif": "longitude", +# "er_depg.tif": "er_depg", +# "sagawet_b_sir.tif": "sagawet", +# "dem_foc2.tif": "dem_foc2", +# "outcrop_dis2.tif": "outcrop", +# "k_15v5.tif": "k_15v5", +# } + +majors = [ + "SIO2", + "TIO2", + "AL2O3", + "FE2O3TOT", + "FE2O3", + "FEO", + "MNO", + "MGO", + "CAO", + "NA2O", + "K2O", + "P2O5", +] + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], + dedupe: bool = False, + intersect_parallel: bool = False, + pixel_count: bool = False, + dedupe_with_median: bool = True + ): + print("====================================\n", f"intersecting {shp.as_posix()}") + pts = gpd.read_file(shp) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + if dedupe: + tif_name = list(geotifs.keys())[0] + tif = Path(tif_name) + # orig_cols = pts.columns + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + # keep one only + # pts_deduped = pts.sort_values( + # by=geom_cols, ascending=[True, True] + # ).groupby(by=['rows', 'cols'], as_index=False).first()[orig_cols] + + # keep mean of repeated observations in a pixel + # + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(px_count=('rows', 'count')) + # if dedupe_with_median: + # import IPython; IPython.embed(); import sys; sys.exit() + + +pts_majors = pts[majors + ['rows', 'cols'] + geom_cols + ['geometry']] +for m in majors: + pts_majors[m] = pts_majors[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=['rows', 'cols']) +rename_dict = {k: k + '_m' for k in majors} +pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=['rows', 'cols']) +for m in majors: + all_pts_with_count_and_median[m + '_distance_from_median'] = \ + 
abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ + abs(all_pts_with_count_and_median[m + '_m']) + +def sum_distance_from_median(x): + s = np.nansum([v for k, v in x.items() if k.endswith('_distance_from_median')]) + return s + +all_pts_with_count_and_median['median_sum'] = all_pts_with_count_and_median.apply(lambda x: sum_distance_from_median( + x), axis=1) + +all_pts_with_count_and_median_min_median_sum = \ + all_pts_with_count_and_median.groupby(by=['rows', 'cols'], as_index=False).agg( + median_sum_min=('median_sum', 'min'), + median_sum_std=('median_sum', 'std') + ) + +all_pts = all_pts_with_count_and_median.merge(all_pts_with_count_and_median_min_median_sum, how='left', on=['rows', + 'cols']) + + +# all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] + +# min_pts = all_pts.groupby(by=['rows', 'cols'], as_index=False).median_sum_to_median_min_diff.agg( +# ['min', 'std'] +# ) + +final_pts = all_pts[all_pts.median_sum==all_pts.median_sum_min] +final_pts_sorted = final_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) +all_pts_sorted = all_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) + +# all_pts.merge(min_pts, how="inner", on=['rows', 'cols', "median_sum_to_median_min_diff"]) + +# all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], +# how='left',on=['rows', 'cols']) + + else: # dedupe with mean + pts_mean = pts[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + # import IPython; IPython.embed(); import sys; sys.exit() + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) + # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] + coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) + else: + pts_deduped = pts + coords_deduped = coords + + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) + + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = 
df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + +if __name__ == '__main__': + # local + output_dir = Path('out_resampled') + shapefile_location = Path("configs/data") + downscale_factor = 2 # keep 1 point in a 2x2 cell + + geom_cols = ['POINT_X', 'POINT_Y'] + + covariastes_list = "configs/data/sirsam/covariates_list.txt" + geotifs_list = read_list_file(list_path=covariastes_list) + shorts = defaultdict(int) + + geotifs = {} + + for gl in geotifs_list: + k, v = generate_key_val(gl, shorts) + geotifs[k] = v + + # check all required files are available on disc + for k, v in geotifs.items(): + print(f"checking if {k} exists") + assert Path(k).exists() + + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) From a1473f128939780e7d021515fdff0ac2be66e40a Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 9 Apr 2022 07:23:55 +1000 Subject: [PATCH 08/33] updated --- scripts/dedupe_with_median_scratch.py | 36 ++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py index d24be9cc..e0bbc3fb 100644 --- a/scripts/dedupe_with_median_scratch.py +++ b/scripts/dedupe_with_median_scratch.py @@ -109,10 +109,10 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe_with_median: bool = True ): print("====================================\n", f"intersecting {shp.as_posix()}") - pts = gpd.read_file(shp) - coords = np.array([(p.x, p.y) for p in pts.geometry]) - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) +pts = gpd.read_file(shp) +coords = np.array([(p.x, p.y) for p in pts.geometry]) +geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) +pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: tif_name = list(geotifs.keys())[0] tif = Path(tif_name) @@ -137,24 +137,26 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], # keep one only # pts_deduped = pts.sort_values( # by=geom_cols, ascending=[True, True] - # ).groupby(by=['rows', 'cols'], as_index=False).first()[orig_cols] + # ).groupby(by=rows_and_cols, as_index=False).first()[orig_cols] # keep mean of repeated observations in a pixel # - pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(px_count=('rows', 'count')) + pts_count = pts.groupby(by=rows_and_cols, as_index=False).agg(px_count=('rows', 'count')) # if dedupe_with_median: # import IPython; IPython.embed(); import sys; sys.exit() 
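The rework that follows replaces the simple per-pixel mean with a median-based dedupe: observations are grouped by the raster cell they fall in ('rows', 'cols'), the per-cell median of each major-oxide column is computed, and the single observation whose profile sits closest to that median is kept. A minimal, self-contained pandas sketch of the same idea (the toy data and column subset are made up for illustration; the real scratch script also handles 'NA' strings and pixel counts):

import pandas as pd

# toy data: three observations fall in the same raster cell (rows=0, cols=0)
df = pd.DataFrame({
    "rows":  [0, 0, 0, 1],
    "cols":  [0, 0, 0, 2],
    "SIO2":  [61.0, 72.0, 62.0, 55.0],
    "AL2O3": [14.0, 9.0, 15.0, 18.0],
})
majors = ["SIO2", "AL2O3"]

# per-cell median of every major, attached back to each observation
med = df.groupby(["rows", "cols"], as_index=False)[majors].median()
df = df.merge(med, on=["rows", "cols"], suffixes=("", "_m"))

# relative distance of each observation from its cell median, summed over majors
for m in majors:
    df[m + "_dist"] = (df[m] - df[m + "_m"]).abs() / df[m + "_m"].abs()
df["median_sum"] = df[[m + "_dist" for m in majors]].sum(axis=1)

# keep, for every cell, the single observation closest to the median profile
deduped = df.loc[df.groupby(["rows", "cols"])["median_sum"].idxmin()]
print(deduped[["rows", "cols"] + majors])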
+rows_and_cols = ['rows', 'cols'] -pts_majors = pts[majors + ['rows', 'cols'] + geom_cols + ['geometry']] +pts_majors = pts[majors + rows_and_cols + geom_cols + ['geometry']] for m in majors: - pts_majors[m] = pts_majors[m].apply(lambda x: np.nan if x == 'NA' else float(x)) -pts_median = pts_majors[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).median() -pts_deduped = pts_median.merge(pts_count, how='inner', on=['rows', 'cols']) + pts[m] = pts[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=rows_and_cols) rename_dict = {k: k + '_m' for k in majors} pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) -all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=['rows', 'cols']) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=rows_and_cols) for m in majors: + print(m) all_pts_with_count_and_median[m + '_distance_from_median'] = \ abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ abs(all_pts_with_count_and_median[m + '_m']) @@ -167,7 +169,7 @@ def sum_distance_from_median(x): x), axis=1) all_pts_with_count_and_median_min_median_sum = \ - all_pts_with_count_and_median.groupby(by=['rows', 'cols'], as_index=False).agg( + all_pts_with_count_and_median.groupby(by=rows_and_cols, as_index=False).agg( median_sum_min=('median_sum', 'min'), median_sum_std=('median_sum', 'std') ) @@ -178,7 +180,7 @@ def sum_distance_from_median(x): # all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] -# min_pts = all_pts.groupby(by=['rows', 'cols'], as_index=False).median_sum_to_median_min_diff.agg( +# min_pts = all_pts.groupby(by=rows_and_cols, as_index=False).median_sum_to_median_min_diff.agg( # ['min', 'std'] # ) @@ -189,15 +191,15 @@ def sum_distance_from_median(x): # all_pts.merge(min_pts, how="inner", on=['rows', 'cols', "median_sum_to_median_min_diff"]) # all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], -# how='left',on=['rows', 'cols']) +# how='left',on=rows_and_cols) else: # dedupe with mean - pts_mean = pts[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).mean() - pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + pts_mean = pts[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=rows_and_cols) # import IPython; IPython.embed(); import sys; sys.exit() pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y'])) - # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] + # pts_deduped = pts.drop_duplicates(subset=rows_and_cols)[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: pts_deduped = pts From 39e04f4319e4ae0e53804f0b55e8885beef72c5e Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sun, 10 Apr 2022 08:59:14 +1000 Subject: [PATCH 09/33] add interpolate shapes script used in nci for wa models --- scripts/interpolate_shapes.py | 63 +++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 scripts/interpolate_shapes.py diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py new file mode 100644 index 00000000..83b34e8f --- /dev/null +++ b/scripts/interpolate_shapes.py 
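The script added in this patch resamples each point's conductivity profile onto a common set of depths taken from a reference AEM shapefile. Its core is scipy.interpolate.interp1d with constant extrapolation at both ends; a minimal sketch of that pattern, with made-up depth and conductivity arrays, looks like this:

import numpy as np
from scipy import interpolate

# depths (cumulative layer thicknesses) at which one profile was measured
old_depths = np.array([5.0, 10.0, 20.0, 40.0])
conductivities = np.array([0.05, 0.08, 0.02, 0.01])

# common depths every profile should be expressed on
new_depths = np.array([2.0, 5.0, 15.0, 30.0, 60.0])

f = interpolate.interp1d(
    old_depths, conductivities,
    bounds_error=False,                                  # don't fail outside the measured range,
    fill_value=(conductivities[0], conductivities[-1]),  # just extend the first/last values
    assume_sorted=True,
)
print(f(new_depths))  # five interpolated conductivities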
@@ -0,0 +1,63 @@ +from functools import partial +from pathlib import Path +from scipy import interpolate +import numpy as np +import pandas as pd +import geopandas as gpd +from joblib import Parallel, delayed + + +def interpolate_1d(row, thicknesses): + conductivities = row[conductivity_cols].values + f = interpolate.interp1d(thicknesses, conductivities, bounds_error=False, + fill_value=(conductivities[0], conductivities[-1]), assume_sorted=True) + new_conductivities = f(new_thicknesses) + return new_conductivities + + +def interpolate_shape(shp): + print(f"interpolating {shp}") + df = gpd.read_file(shp) + thicknesses = df.loc[0, thickness_cols].values.cumsum() + interpolate_1d_local = partial(interpolate_1d, thicknesses=thicknesses) + interpolated_conductivities = df.apply( + func=interpolate_1d_local, + axis=1, + raw=False, + ) + df.loc[:, conductivity_cols] = np.vstack(interpolated_conductivities) + df.loc[:, thickness_cols] = np.tile(new_thicknesses, (df.shape[0], 1)) + target_shp = output_dir.joinpath(Path(shp).name) + df.to_file(target_shp) + print(f"wrote interpolated shapefile {target_shp}") + + +if __name__ == '__main__': + + ref_df = gpd.read_file("./NT_1_albers.shp", rows=1) + + shapes_to_convert = [ + './AEM_GAB_albers.shp', + './Albany_1_albers_cleaned.shp', + './Albany_2_albers_mod_cleaned.shp', + './Murchison_1_albers_cleaned.shp', + './Murchison_2_albers_cleaned.shp', + './Murchison_3_albers.shp', + './frome_1_albers.shp', + './frome_2_albers.shp', + './frome_3_albers.shp', + './frome_4_albers.shp', + './frome_5_albers.shp', + './frome_6_albers.shp' + ] + output_dir = Path('interpolated') + output_dir.mkdir(exist_ok=True, parents=True) + + conductivity_cols = [c for c in ref_df.columns if c.startswith('cond_')] + thickness_cols = [t for t in ref_df.columns if t.startswith('thick_')] + new_thicknesses = np.array(ref_df.loc[0, thickness_cols].values.cumsum(), dtype=np.float32) + + rets = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(interpolate_shape)(s) for s in shapes_to_convert) \ No newline at end of file From 11ed0d262491619e3265ccc0e94d7ebb008f4110 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 12 Apr 2022 15:15:31 +1000 Subject: [PATCH 10/33] paper plot --- scripts/plot_true_vs_prediction.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/plot_true_vs_prediction.py b/scripts/plot_true_vs_prediction.py index 819ad888..f7f4902a 100644 --- a/scripts/plot_true_vs_prediction.py +++ b/scripts/plot_true_vs_prediction.py @@ -99,3 +99,25 @@ def plot_quantiles(line): plt.show() + + +xx = line.d +y_test = line.Prediction +y_lower = line['Lower_quan'] +y_upper = line['Upper_quan'] +plt.plot(xx, line.y_true, "b.", markersize=10, label="Observed log10(conductivity)") +plt.plot(xx, line.out, "g", markersize=10, linestyle='-', + label="Kriged log10(conductivity)") +plt.plot(xx, y_test, color='orange', linestyle='-', + # marker='o', + label="Predicted median") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + # (95362.213934-line.d).ravel() + (line.d).ravel(), + y_lower, y_upper, alpha=0.4, label='Predicted 90% interval' +) +plt.legend(loc='upper left') +plt.ylabel('log10(conductivity)') +plt.xlabel('Distance along flight line (meters)') From 3c333e04ba3c7763ec784b547574c808eff73f8d Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 28 May 2022 11:10:31 +1000 Subject: [PATCH 11/33] update script --- scripts/intersect_rasters.py | 79 +++++++----------------------------- 1 file changed, 15 insertions(+), 64 
deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 0aef9989..a33529b7 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -33,64 +33,13 @@ def generate_key_val(gl, shorts): return gl, eight_char_name + str(shorts[eight_char_name]) -# import IPython; IPython.embed(); import sys; sys.exit() - - -# data_location = \ -# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") -# Read points from shapefile - -# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") - -# geotifs = { -# "relief_radius4.tif": "relief4", -# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", -# "MvrtpLL_smooth.tif": "mrvtpll_s", -# "MvrtpLL_fin.tif": "mvrtpll_f", -# "mrvbf_9.tif": "mrvbf_9", -# "Rad2016K_Th.tif": "rad2016kth", -# "Thorium_2016.tif": "th_2016", -# "Mesozoic_older_raster_MEAN.tif": "meso_mean", -# "LOC_distance_to_coast.tif": "loc_dis", -# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", -# "water-85m_1.tif": "water_85m", -# "clim_RSM_albers.tif": "clim_rsm", -# "tpi_300.tif": "tpi_300", -# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", -# "si_geol1.tif": "si_geol1", -# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", -# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", -# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", -# "Gravity_land.tif": "gravity", -# "dem_fill.tif": "dem", -# "Clim_Prescott_LindaGregory.tif": "clim_linda", -# "slope_fill2.tif": "slopefill2", -# "clim_PTA_albers.tif": "clim_alber", -# "SagaWET9cell_M.tif": "sagawet", -# "ceno_euc_aust1.tif": "ceno_euc", -# "s2-dpca-85m_band1.tif": "s2_band1", -# "s2-dpca-85m_band2.tif": "s2_band2", -# "s2-dpca-85m_band3.tif": "s2_band3", -# "3dem_mag0_finn.tif": "3dem_mag0", -# "3dem_mag1_fin.tif": "3dem_mag1", -# "3dem_mag2.tif": "3dem_mag2", -# } - -# geotifs = { -# "relief_apsect.tif": "relief4", -# "LATITUDE_GRID1.tif": "latitude", -# "LONGITUDE_GRID1.tif": "longitude", -# "er_depg.tif": "er_depg", -# "sagawet_b_sir.tif": "sagawet", -# "dem_foc2.tif": "dem_foc2", -# "outcrop_dis2.tif": "outcrop", -# "k_15v5.tif": "k_15v5", -# } - - -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, intersect_parallel: bool = False): +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, + intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g) coords = np.array([(p.x, p.y) for p in pts.geometry]) geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) pts = pts.merge(geom, left_index=True, right_index=True) @@ -124,7 +73,6 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() - # import IPython; IPython.embed(); import sys; sys.exit() pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], @@ -139,6 +87,7 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") # pts_deduped[v] = __extract_raster_points(coords_deduped, k) if intersect_parallel: + 
print("=====================Intersecting parallelly =======================") pts_deduped__ = Parallel( n_jobs=-1, verbose=100, @@ -149,14 +98,14 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = else: for i, (k, v) in enumerate(geotifs.items()): print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") - pts_deduped[v] = __extract_raster_points(coords_deduped, k) + pts_deduped[v] = __extract_raster_points(k, coords_deduped) # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) return pts_deduped -def __extract_raster_points(raster, coords_deduped): +def __extract_raster_points(raster: str, coords_deduped: np.ndarray): with rasterio.open(Path(raster)) as src: print(f"---- intersecting {raster}--------") return [x[0] for x in src.sample(coords_deduped)] @@ -169,8 +118,10 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = output_dir.joinpath('clean') out_shp = output_dir.joinpath(shp.name) df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + finite_idices = ((np.isfinite(df3)).sum(axis=1) == len(geotifs)) & \ + ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)) + df4 = df2.loc[finite_idices, :] + df5 = df2.loc[~finite_idices, :] try: if df5.shape[0]: df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) @@ -190,7 +141,7 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = # local output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - downscale_factor = 2 # keep 1 point in a 2x2 cell + downscale_factor = 1 # keep 1 point in a 2x2 cell geom_cols = ['POINT_X', 'POINT_Y'] @@ -215,8 +166,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = print(f"\"{Path(k).name}\": \"{v}\"") print('='*100) - for s in shapefile_location.glob("*.shp"): - intersect_sample_and_clean(s, True, True) + for s in shapefile_location.glob("geochem_sites.shp"): + intersect_sample_and_clean(s, dedupe=True, write_dropped=True, intersect_parallel=True) # rets = Parallel( # n_jobs=-1, # verbose=100, From c32ddaeb6912ab2e32cf300fc3c941ac8e6679af Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Mon, 4 Jul 2022 12:51:48 +1000 Subject: [PATCH 12/33] [predict] try once without cleaning for all algos --- uncoverml/predict.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index f5abb696..512ecdd4 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -128,8 +128,7 @@ def _fix_for_corrupt_data(x, feature_names): x.data[min_mask] = float32finfo.min x.data[max_mask] = float32finfo.max x.mask += min_mask + max_mask - problem_feature_names = \ - list(compress(feature_names, ~isfinite.all(axis=0))) + problem_feature_names = list(compress(feature_names, ~isfinite.all(axis=0))) print("Warning: Float64 data was truncated to float32 max/min values." 
"Check your covariates for possible nodatavalue, datatype, " @@ -160,15 +159,6 @@ def _get_data(subchunk, config): log.info("Applying feature transforms") x = features.transform_features(extracted_chunk_sets, transform_sets, config.final_transform, config)[0] - # only check/correct float32 conversion for Ensemble models - if config.pca: - x = _fix_for_corrupt_data(x, features_names) - return _mask_rows(x, subchunk, config), features_names - - if not config.clustering: - if (isinstance(modelmaps[config.algorithm](), BaseEnsemble) or - config.multirandomforest): - x = _fix_for_corrupt_data(x, features_names) return _mask_rows(x, subchunk, config), features_names @@ -213,8 +203,12 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf log.info("Loaded {:2.4f}GB of image data".format(total_gb)) alg = config.algorithm log.info("Predicting targets for {}.".format(alg)) - y_star = predict(x, model, interval=config.quantiles, - lon_lat=_get_lon_lat(subchunk, config)) + try: + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) + except ValueError as v: + log.warning(v) + x = _fix_for_corrupt_data(x, features_names) + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) # cluster_analysis(x, y_star, subchunk, config, feature_names) From 00d41baea16680afa751a3a1daa6174e79ea66fe Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 23 Jul 2022 05:55:11 +1000 Subject: [PATCH 13/33] update some scripts --- configs/demo_regression.yaml | 1 - configs/group_and_split_shapefile.yaml | 0 configs/reference_gb.yaml | 42 ++++++++--------------- configs/reference_xgboost.yaml | 28 ++++++++-------- configs/resampling.yaml | 4 +-- scripts/check_covariates.py | 46 ++++++++++++++++++++++++++ scripts/interpolate_shapes.py | 31 +++++++++-------- scripts/intersect_rasters.py | 3 +- 8 files changed, 96 insertions(+), 59 deletions(-) mode change 100755 => 100644 configs/group_and_split_shapefile.yaml mode change 100755 => 100644 configs/resampling.yaml create mode 100644 scripts/check_covariates.py diff --git a/configs/demo_regression.yaml b/configs/demo_regression.yaml index eda84ce0..c120e7aa 100644 --- a/configs/demo_regression.yaml +++ b/configs/demo_regression.yaml @@ -33,7 +33,6 @@ mask: # imputation options are gaus, nn, mean preprocessing: imputation: mean - transforms: # - whiten: # keep_fraction: 0.8 diff --git a/configs/group_and_split_shapefile.yaml b/configs/group_and_split_shapefile.yaml old mode 100755 new mode 100644 diff --git a/configs/reference_gb.yaml b/configs/reference_gb.yaml index 2c1717a8..c5f7f787 100644 --- a/configs/reference_gb.yaml +++ b/configs/reference_gb.yaml @@ -58,17 +58,17 @@ learning: algorithm: gradientboost arguments: target_transform: identity - loss: 'ls' - max_depth: 20 - learning_rate: 0.1 - n_estimators: 100 - subsample: 0.9 - min_samples_split: 2 - min_samples_leaf: 2 - min_weight_fraction_leaf: 0.0 - max_features: "auto" - alpha: 0.95 - random_state: 3 +# loss: 'ls' +# max_depth: 20 +# learning_rate: 0.1 + n_estimators: 200 +# subsample: 0.9 +# min_samples_split: 2 +# min_samples_leaf: 2 +# min_weight_fraction_leaf: 0.0 +# max_features: "auto" +# alpha: 0.95 +# random_state: 3 optimisation: hyperopt_params: max_evals: 5 @@ -89,22 +89,6 @@ learning: min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) max_leaf_nodes: 
randint('max_leaf_nodes', 10, 50) - searchcv_params: - n_iter: 6 - cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform") - 'n_estimators': Integer(10, 100) - 'subsample': Real(0.01, 1.0, prior='uniform') - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') - 'max_leaf_nodes': Integer(10, 50) prediction: # corner_coordinates: @@ -114,10 +98,10 @@ prediction: outbands: 1 validation: - #- feature_rank + - feature_rank - parallel - k-fold: - folds: 3 + folds: 5 random_seed: 1 output: diff --git a/configs/reference_xgboost.yaml b/configs/reference_xgboost.yaml index dcd762c7..7101cd73 100644 --- a/configs/reference_xgboost.yaml +++ b/configs/reference_xgboost.yaml @@ -1,26 +1,29 @@ - experiment: my_run patchsize: 0 memory_fraction: 0.5 features: - - name: my continuous features + - name: my continuous features 2 type: continuous files: - - path: configs/data/sirsam/er_depg.tif - - path: configs/data/sirsam/sagawet_b_sir.tif - path: configs/data/sirsam/dem_foc2.tif - path: configs/data/sirsam/outcrop_dis2.tif -# - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif -# - path: configs/data/sirsam/LATITUDE_GRID1.tif -# - path: configs/data/sirsam/LONGITUDE_GRID1.tif -# - directory: configs/data/weights/ transforms: - identity # - whiten: # keep_fraction: 0.98 - imputation: none + imputation: mean + - name: my continuous features 1 + type: continuous + files: + - path: configs/data/sirsam/er_depg.tif + - path: configs/data/sirsam/sagawet_b_sir.tif + transforms: + - identity + # - whiten: + # keep_fraction: 0.98 + imputation: gauss preprocessing: imputation: none @@ -125,7 +128,7 @@ prediction: outbands: 1 validation: - #- feature_rank +# - feature_rank - parallel # - permutation_importance # - feature_importance @@ -134,9 +137,8 @@ validation: random_seed: 1 oos_validation: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/geochem_sites.shp + property: K_ppm_imp output: directory: ref_xgb/ - diff --git a/configs/resampling.yaml b/configs/resampling.yaml old mode 100755 new mode 100644 index 791831b0..c84cfc73 --- a/configs/resampling.yaml +++ b/configs/resampling.yaml @@ -2,8 +2,8 @@ # when undersample == true, we choose points with or without bootstrap targets: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/sirsam/out_resampled/geochem_sites.shp + property: K_ppm_imp resample: spatial: undersample: false diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py new file mode 100644 index 00000000..5aa67766 --- /dev/null +++ b/scripts/check_covariates.py @@ -0,0 +1,46 @@ +from joblib import delayed, Parallel +from pathlib import Path +import rasterio as rio +import numpy as np +import pandas as pd + +dir = "configs/data/sirsam" + +mask = Path(dir).joinpath("dem_foc2.tif") + +with rio.open(mask) as geotif: + mask_raster = geotif.read(1, masked=True) + + +def _parallel_read(r): + try: + with rio.open(r) as geotif: + raster = geotif.read(1, masked=True) + m = geotif.meta + m['crs'] = m['crs'].to_string() + t = m.pop('transform') + m['pixsize_x'] = t[0] + m['top_left_x'] = t[2] + m['pixsize_y'] = -t[4] + m['top_left_y'] = t[5] + raster.mask = raster.mask | mask_raster.mask + m['all_finite'] = np.all(np.isfinite(raster)) + m['any_nan'] = 
np.any(np.isnan(raster)) + m['any_large'] = np.any(np.abs(raster) > 1e10) + # raster_attrs[r.stem] = m + return m + except Exception as e: + print(r) + print(e) + return [None] * 14 + + +rets = Parallel( + n_jobs=-1, + verbose=100, +)(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} + +df = pd.DataFrame.from_dict(raster_attrs) +df.to_csv("quality.csv") diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py index 83b34e8f..5140cb59 100644 --- a/scripts/interpolate_shapes.py +++ b/scripts/interpolate_shapes.py @@ -34,21 +34,26 @@ def interpolate_shape(shp): if __name__ == '__main__': - ref_df = gpd.read_file("./NT_1_albers.shp", rows=1) + ref_df = gpd.read_file("/g/data/ge3/aem_sections/AEM_covariates/NT_1_albers.shp", rows=1) shapes_to_convert = [ - './AEM_GAB_albers.shp', - './Albany_1_albers_cleaned.shp', - './Albany_2_albers_mod_cleaned.shp', - './Murchison_1_albers_cleaned.shp', - './Murchison_2_albers_cleaned.shp', - './Murchison_3_albers.shp', - './frome_1_albers.shp', - './frome_2_albers.shp', - './frome_3_albers.shp', - './frome_4_albers.shp', - './frome_5_albers.shp', - './frome_6_albers.shp' + # "/g/data/ge3/data/AEM_shapes/AEM_ERC_ALL_interp_data_Albers_Ceno.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_3_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_4_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_5_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_6_albers.shp", + "/g/data/ge3/data/AEM_shapes/Mundi_Mundi_albers.shp", ] output_dir = Path('interpolated') output_dir.mkdir(exist_ok=True, parents=True) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index a33529b7..ea50d789 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -133,7 +133,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = print(f"saved intersected shapefile at {out_shp.as_posix()}") print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - except: + except Exception as e: + print(e) print(f"Check this shapefile {shp}") From 4129050e7fb7bb105778415160b90ba2cd487b0b Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 27 Aug 2022 06:23:58 +1000 Subject: [PATCH 14/33] dedupe shape script added --- scripts/check_covariates.py | 4 +-- scripts/dedupe_shape.py | 70 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 scripts/dedupe_shape.py diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 5aa67766..24970320 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -23,7 +23,7 @@ def _parallel_read(r): m['top_left_x'] = t[2] m['pixsize_y'] = -t[4] m['top_left_y'] = t[5] - raster.mask = raster.mask | mask_raster.mask + raster.mask 
= raster.mask | mask_raster.mask # we are not interested in masked areas m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) @@ -40,7 +40,7 @@ def _parallel_read(r): verbose=100, )(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) -raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} df = pd.DataFrame.from_dict(raster_attrs) df.to_csv("quality.csv") diff --git a/scripts/dedupe_shape.py b/scripts/dedupe_shape.py new file mode 100644 index 00000000..42c2b440 --- /dev/null +++ b/scripts/dedupe_shape.py @@ -0,0 +1,70 @@ +from pathlib import Path +from joblib import Parallel, delayed +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd + + +def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path): + """ + :param shp: input shapefile with dense points + :param tif: sample tif to read resolution details + :param deduped_shp: output shapefile with one point per down-sampled raster resolution + :return: + """ + print("====================================\n", f"deduping {shp.as_posix()}") + geom_cols = ['POINT_X', 'POINT_Y'] + pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g, axis=1) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) + pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, + geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']), + crs="EPSG:3577" # Australian Albers + ) + pts_deduped.to_file(deduped_shp.as_posix()) + return pts_deduped + + +if __name__ == '__main__': + shapefiles = Path("configs/data/") + downscale_factor = 6 # keep 1 point in a 6x6 cell + + dem = Path('/home/my_dem.tif') + output_dir = Path('1in6') + output_dir.mkdir(exist_ok=True, parents=True) + + # for s in shapefiles.glob("*.shp"): + # deduped_shp = output_dir.joinpath(s.name) + # dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp) + + Parallel( + n_jobs=-1, + verbose=100, + )(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp")) From 508576d13178c447dec96b27f4007774895bdd20 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 3 Sep 2022 17:53:02 +1000 Subject: [PATCH 15/33] add more statistics in covariate check --- scripts/check_covariates.py | 14 ++++++++++++-- setup.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 24970320..3105f0ab 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -1,6 +1,8 @@ +import subprocess from 
joblib import delayed, Parallel from pathlib import Path import rasterio as rio +from rasterio.io import DatasetReader import numpy as np import pandas as pd @@ -12,10 +14,11 @@ mask_raster = geotif.read(1, masked=True) -def _parallel_read(r): + +def _parallel_read(r: Path): try: with rio.open(r) as geotif: - raster = geotif.read(1, masked=True) + raster: DatasetReader = geotif.read(1, masked=True) m = geotif.meta m['crs'] = m['crs'].to_string() t = m.pop('transform') @@ -27,6 +30,13 @@ def _parallel_read(r): m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) + m['min'] = np.ma.min(raster) + m['max'] = np.ma.max(raster) + m['std'] = np.ma.std(raster) + m['valid_percent'] = 100 - np.sum(raster.mask) / raster.size * 100 + print("=" * 50) + print(r, m['valid_percent']) + subprocess.run(f"gdalinfo {r.as_posix()} -stats | grep -i valid_", shell=True) # raster_attrs[r.stem] = m return m except Exception as e: diff --git a/setup.py b/setup.py index 068891c6..b14d4aba 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def run(self): 'revrand >= 0.9.10', 'mpi4py==3.1.3', 'scipy==1.6.2', - 'scikit-learn~=0.22.2', + 'scikit-learn==0.22.2.post1', 'scikit-image==0.19.1', 'scikit-optimize == 0.8.1', 'wheel >= 0.29.0', From 78bbc68a27346bb93d7d737f34c118beec890cb1 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 9 Nov 2022 07:42:17 +1100 Subject: [PATCH 16/33] ref rf with hpopt --- configs/ref_rf.yaml | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index abdb678f..2f3d8c89 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -39,19 +39,35 @@ learning: random_state: 1 max_depth: 20 optimisation: - searchcv_params: - n_iter: 6 + hyperopt_params: + max_evals: 5 + step: 2 cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'n_estimators': Integer(10, 100) - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') + verbose: true + random_state: 3 + scoring: r2 # r2, neg_mean_absolute_error, etc..see note above + algo: bayes # bayes, or anneal + hp_params_space: + max_depth: randint('max_depth', 1, 15) + n_estimators: randint('n_estimators', 5, 25) + max_features: choice('max_features', ['auto', 'sqrt', 'log2']) + min_samples_split: randint('min_samples_split', 2, 50) + min_samples_leaf: randint('min_samples_leaf', 1, 50) + min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) + max_leaf_nodes: randint('max_leaf_nodes', 10, 50) +# searchcv_params: +# n_iter: 6 +# cv: 2 +# verbose: 1000 +# n_points: 3 +# n_jobs: 6 +# params_space: +# 'max_depth': Integer(1, 15) +# 'n_estimators': Integer(10, 100) +# 'max_features': Categorical(['auto', 'sqrt', 'log2']) +# 'min_samples_split': Integer(2, 50) +# 'min_samples_leaf': Integer(1, 50) +# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') prediction: From 2284f8396fbf5340158ce50caf8bd90c07d0d074 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 9 Nov 2022 07:43:08 +1100 Subject: [PATCH 17/33] discard more data during optmisation if they are not finite --- uncoverml/validate.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/uncoverml/validate.py b/uncoverml/validate.py index cc2cf341..5239cff9 100644 --- a/uncoverml/validate.py 
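The hp_params_space block added to ref_rf.yaml above describes a hyperopt search space, with algo: bayes presumably mapping to the TPE suggester and anneal to hyperopt's annealing suggester. Exactly how uncoverml evaluates those strings is not shown in this diff; a rough, hand-written equivalent of the same kind of search, using stand-in data and a plain scikit-learn objective, might look like:

from hyperopt import fmin, hp, tpe
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

# roughly the same space as the hp_params_space block, written by hand
space = {
    "max_depth": hp.quniform("max_depth", 1, 15, 1),
    "n_estimators": hp.quniform("n_estimators", 5, 25, 1),
    "max_features": hp.choice("max_features", ["sqrt", "log2"]),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 1, 50, 1),
}

def objective(params):
    model = RandomForestRegressor(
        max_depth=int(params["max_depth"]),
        n_estimators=int(params["n_estimators"]),
        max_features=params["max_features"],
        min_samples_leaf=int(params["min_samples_leaf"]),
        random_state=0,
    )
    # hyperopt minimises, so return the negated cross-validated r2 score
    return -cross_val_score(model, X, y, cv=2, scoring="r2").mean()

best = fmin(objective, space, algo=tpe.suggest, max_evals=5)
print(best)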
+++ b/uncoverml/validate.py @@ -372,14 +372,16 @@ def setup_validation_data(X, targets_all, cv_folds, random_state=None): random_state=random_state) rows_with_at_least_one_masked = ~ np.any(X.mask, axis=1) - X = X[rows_with_at_least_one_masked, :] - w = w[rows_with_at_least_one_masked] - y = y[rows_with_at_least_one_masked] - lon_lat = lon_lat[rows_with_at_least_one_masked, :] - groups = groups[rows_with_at_least_one_masked] + finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1] + valid_rows = rows_with_at_least_one_masked & finite_X + X = X[valid_rows, :] + w = w[valid_rows] + y = y[valid_rows] + lon_lat = lon_lat[valid_rows, :] + groups = groups[valid_rows] for (f, v), a in zip(targets_all.fields.items(), arrays): - targets_all.fields[f] = a[rows_with_at_least_one_masked] + targets_all.fields[f] = a[valid_rows] if len(np.unique(groups)) >= cv_folds: log.info(f'Using GroupKFold with {cv_folds} folds') From e9ed58318c8b0fbe49fe0203294842b292d6e37b Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 2 Dec 2022 17:24:16 +1100 Subject: [PATCH 18/33] update util scripts --- scripts/check_covariates.py | 19 +++++--- scripts/intersect_raster_with_shape.py | 62 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 scripts/intersect_raster_with_shape.py diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 3105f0ab..117f556c 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -27,16 +27,17 @@ def _parallel_read(r: Path): m['pixsize_y'] = -t[4] m['top_left_y'] = t[5] raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas + # print(raster) m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) m['min'] = np.ma.min(raster) + m['mean'] = np.ma.mean(raster) + m['median'] = np.ma.median(raster) m['max'] = np.ma.max(raster) m['std'] = np.ma.std(raster) - m['valid_percent'] = 100 - np.sum(raster.mask) / raster.size * 100 - print("=" * 50) - print(r, m['valid_percent']) - subprocess.run(f"gdalinfo {r.as_posix()} -stats | grep -i valid_", shell=True) + m['skew'] = 3 * (m['mean'] - m['median']) / m['std'] + # subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True) # raster_attrs[r.stem] = m return m except Exception as e: @@ -46,11 +47,15 @@ def _parallel_read(r: Path): rets = Parallel( - n_jobs=-1, + n_jobs=1, verbose=100, -)(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) +)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif")) -raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} +import pickle +with open("rets.pk", 'wb') as f: + pickle.dump(rets, f) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)} df = pd.DataFrame.from_dict(raster_attrs) df.to_csv("quality.csv") diff --git a/scripts/intersect_raster_with_shape.py b/scripts/intersect_raster_with_shape.py new file mode 100644 index 00000000..b25a21f9 --- /dev/null +++ b/scripts/intersect_raster_with_shape.py @@ -0,0 +1,62 @@ +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318_std.tif +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif + + +import geopandas as gpd +import numpy as np +import pandas as pd +import rasterio as rio + + +# with rasterio.open(tif) as src: +# # resample data to target shape +# data = src.read( +# 
out_shape=( +# src.count, +# int(src.height / downscale_factor), +# int(src.width / downscale_factor) +# ), +# resampling=rasterio.enums.Resampling.bilinear +# ) +# # scale image transform +# transform = src.transform * src.transform.scale( +# (src.width / data.shape[-1]), +# (src.height / data.shape[-2]) +# ) +# pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + +def raster_points() -> gpd.GeoDataFrame: + with rio.Env(): + with rio.open( + '/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif') as src: + crs = src.crs + # create 1D coordinate arrays (coordinates of the pixel center) + xmin, ymax = np.around(src.xy(0.00, 0.00), 9) # src.xy(0, 0) + xmax, ymin = np.around(src.xy(src.height - 1, src.width - 1), 9) # src.xy(src.width-1, src.height-1) + x = np.linspace(xmin, xmax, src.width) + y = np.linspace(ymax, ymin, src.height) # max -> min so coords are top -> bottom + + # create 2D arrays + xs, ys = np.meshgrid(x, y) + zs = src.read(1) + + # Apply NoData mask + mask = src.read_masks(1) > 0 + xs, ys, zs = xs[mask], ys[mask], zs[mask] + data = {"X": pd.Series(xs.ravel()), + "Y": pd.Series(ys.ravel()), + "Z": pd.Series(zs.ravel())} + df = pd.DataFrame(data=data) + geometry = gpd.points_from_xy(df.X, df.Y) + gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry) + return gdf + + +raster_gdf = raster_points() + +shape_file = "" +shp_gdf = gpd.read_file(shape_file) +gpd.overlay(raster_gdf, shp_gdf, how="intersection") +print(raster_gdf.head()) + From c4e2a684865239a74d418b3ab76e3b621b111a57 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 2 Dec 2022 17:24:58 +1100 Subject: [PATCH 19/33] minor changes in config and retained data during training --- uncoverml/config.py | 2 +- uncoverml/features.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/uncoverml/config.py b/uncoverml/config.py index ed1dee90..bff5c28c 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -286,6 +286,7 @@ def __init__(self, yaml_file): self.split_oos_fraction = s['targets']['split']['oos_fraction'] self.train_shapefile = Path(self.output_dir).joinpath(s['output']['train_shapefile']) self.oos_shapefile = Path(self.output_dir).joinpath(s['output']['oos_shapefile']) + self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.mask = None if 'mask' in s: @@ -380,7 +381,6 @@ def __init__(self, yaml_file): else self.name + ('.cluster' if self.clustering else '.model') self.model_file = Path(self.output_dir).joinpath(output_model) - self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") diff --git a/uncoverml/features.py b/uncoverml/features.py index bcbe4880..3d7fcebc 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -170,9 +170,15 @@ def cull_all_null_rows(feature_sets): bool_transformed_vectors = np.concatenate([t.mask for t in transformed_vectors], axis=1) - covaraiates = bool_transformed_vectors.shape[1] - rows_to_keep = np.sum(bool_transformed_vectors, axis=1) != covaraiates - return rows_to_keep + data_transformed_vectors = np.concatenate([t.data for t in + 
transformed_vectors], axis=1) + num_covariates = bool_transformed_vectors.shape[1] + + rows_with_at_least_one_cov_unmasked = np.sum(bool_transformed_vectors, axis=1) != num_covariates + # good rows are any covariate unmasked and all finite covariates + rows_with_all_finite_covariates = np.isfinite(data_transformed_vectors).sum(axis=1) == num_covariates + good_rows = rows_with_all_finite_covariates & rows_with_at_least_one_cov_unmasked + return good_rows def gather_features(x, node=None): From 69951f7f7455133e2a296467e8f43d6c394a11cc Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 00:17:24 +1100 Subject: [PATCH 20/33] pcas implemented --- configs/reference_pca.yaml | 3 ++- uncoverml/config.py | 2 ++ uncoverml/predict.py | 9 ++++++--- uncoverml/scripts/uncoverml.py | 27 ++++++++++++++++++++++----- uncoverml/transforms/linear.py | 2 +- 5 files changed, 33 insertions(+), 10 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index aea2c3c9..8abeb833 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -18,7 +18,8 @@ features: preprocessing: transforms: - whiten: - n_components: 4 + n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components allowed pca: geotif: diff --git a/uncoverml/config.py b/uncoverml/config.py index bff5c28c..c226f810 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -298,6 +298,8 @@ def __init__(self, yaml_file): preprocessing_transforms = s['preprocessing']['transforms'] if 'n_components' in preprocessing_transforms[0]['whiten']: self.n_components = preprocessing_transforms[0]['whiten']['n_components'] + elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: + self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: diff --git a/uncoverml/predict.py b/uncoverml/predict.py index 512ecdd4..f8ba2e71 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -13,6 +13,7 @@ from uncoverml.krige import krig_dict from uncoverml import transforms from uncoverml.config import Config +from uncoverml.hyopt import NpEncoder log = logging.getLogger(__name__) float32finfo = np.finfo(dtype=np.float32) @@ -217,17 +218,19 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): x, _ = _get_data(subchunk, config) - mpiops.run_once(export_pca_fractions, config) total_gb = mpiops.comm.allreduce(x.nbytes / 1e9) log.info("Loaded {:2.4f}GB of image data".format(total_gb)) log.info("Extracting PCAs....") + # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue + # is the first PC. 
This will hopefully remove some ambiguity image_out.write(np.flip(x, axis=1), subchunk) -def export_pca_fractions(config): +def export_pca_fractions(config: Config): whiten_transform = config.final_transform.global_transforms[0] with open(config.pca_json, 'w') as output_file: - json.dump(whiten_transform.explained_ratio, output_file, sort_keys=True, indent=4) + json.dump(whiten_transform.__dict__, output_file, sort_keys=True, indent=4, cls=NpEncoder) + log.info(f"wrote pc contributions/whiten transform stats in {config.pca_json}") def cluster_analysis(x, y, partition_no, config, feature_names): diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index f40c1352..27939d39 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -390,14 +390,20 @@ def predict(model_or_cluster_file, partitions, mask, retain): @click.argument('pipeline_file') @click.option('-p', '--partitions', type=int, default=1, help='divide each node\'s data into this many partitions') +@click.option('-s', '--subsample_fraction', type=float, default=1.0, + help='only use this fraction of the data for learning classes') @click.option('-m', '--mask', type=str, default='', help='mask file used to limit prediction area') @click.option('-r', '--retain', type=int, default=None, help='mask values where to predict') -def pca(pipeline_file, partitions, mask, retain): +def pca(pipeline_file, partitions, subsample_fraction, mask, retain): config = ls.config.Config(pipeline_file) + + __validate_pca_config(config) # no other transforms other than whiten assert config.pca, "Not a pca analysis. Please include the pca block in your yaml!!" config.mask = mask if mask else config.mask + + config.subsample_fraction = subsample_fraction if config.mask: config.retain = retain if retain else config.retain @@ -413,8 +419,20 @@ def pca(pipeline_file, partitions, mask, retain): else: log.info("Using memory aggressively: dividing all data between nodes") - image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(config.n_components, config) - tr_whiten = __validate_pca_config(config) + # Get the image chunks and their associated transforms + image_chunk_sets = ls.geoio.unsupervised_feature_sets(config) + transform_sets = [k.transform_set for k in config.feature_sets] + + _ = ls.features.transform_features(image_chunk_sets, + transform_sets, + config.final_transform, + config) + log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") + ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) + ls.mpiops.run_once(ls.predict.export_pca_fractions, config) + + whiten_transform = config.final_transform.global_transforms[0] + image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(whiten_transform.keepdims, config) outfile_tif = config.name + "_pca" image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, @@ -422,10 +440,8 @@ def pca(pipeline_file, partitions, mask, retain): config.n_subchunks, config.output_dir, band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)], **config.geotif_options) - for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) - # TODO: ideally want to take a random sample of each covariate and compute whiten stats ls.predict.export_pca(i, image_out, config) # explicitly close output rasters @@ -435,6 +451,7 @@ def pca(pipeline_file, partitions, mask, retain): def __validate_pca_config(config): + # assert no other transforms other than whiten 
transform_sets = [k.transform_set for k in config.feature_sets] for t in transform_sets: assert len(t.image_transforms) == 0 # validation that there are no image or global transforms diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index 2fc0e7b8..343bd663 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -90,7 +90,7 @@ def __init__(self, keep_fraction=None, n_components=None): if keep_fraction is None: assert n_components is not None assert (keep_fraction is None) or (n_components is None) - self.explained_ratio = None + self.explained_ratio = {} def __call__(self, x): x = x.astype(float) From 0dfeff7e1299fdc925629ed2205a0b60b940ba86 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 00:48:26 +1100 Subject: [PATCH 21/33] some helpful logs --- uncoverml/geoio.py | 5 +++-- uncoverml/scripts/uncoverml.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 0850c077..03c65d95 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -537,8 +537,9 @@ def f(image_source): patchsize=config.patchsize) if frac < 1.0: np.random.seed(1) - r = r[np.random.rand(r.shape[0]) < frac] - return r + rr = r[np.random.rand(r.shape[0]) < frac] + log.info(f"sampled {rr.shape[0]} from max possible {r.shape[0]}") + return rr result = _iterate_sources(f, config) return result diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 27939d39..a748c501 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -423,10 +423,12 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_chunk_sets = ls.geoio.unsupervised_feature_sets(config) transform_sets = [k.transform_set for k in config.feature_sets] - _ = ls.features.transform_features(image_chunk_sets, + features, _ = ls.features.transform_features(image_chunk_sets, transform_sets, config.final_transform, config) + log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" + f" {features.shape[0]} points from the rasters") log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) ls.mpiops.run_once(ls.predict.export_pca_fractions, config) From bbb603bc6f2272c08c0fa85242af196abc575f33 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 01:02:27 +1100 Subject: [PATCH 22/33] some more helpful logs --- uncoverml/scripts/uncoverml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index a748c501..c7d632a8 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -427,8 +427,10 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): transform_sets, config.final_transform, config) + print(f"process {ls.mpiops.chunk_index} has {features.shape[0]} samples") + num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0])) log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" - f" {features.shape[0]} points from the rasters") + f" {num_samples} points from the rasters") log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) ls.mpiops.run_once(ls.predict.export_pca_fractions, config) From 4a9c899c850f676335eaaffe0eda19f841dbdfd0 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 
2022 07:27:29 +1100 Subject: [PATCH 23/33] some more useful logs --- uncoverml/transforms/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index 343bd663..ebf3e4e3 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -1,7 +1,8 @@ - +import logging import numpy as np from uncoverml import mpiops +log = logging.getLogger(__name__) class CentreTransform: @@ -96,6 +97,9 @@ def __call__(self, x): x = x.astype(float) if self.mean is None or self.eigvals is None or self.eigvecs is None: self.mean = mpiops.mean(x) + num_points = np.sum(mpiops.comm.gather(x.shape[0])) + num_covariates = x.shape[1] + log.info(f"Matrix for eigenvalue decomposition has shape {num_points, num_covariates}") self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x) ndims = x.shape[1] # make sure 1 <= keepdims <= ndims From 4bd8490f67c4535b49f0606d36d8bbac365d9df3 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 08:10:12 +1100 Subject: [PATCH 24/33] variation fraction support implemented for pca analysis --- configs/reference_pca.yaml | 5 +++-- uncoverml/config.py | 2 ++ uncoverml/scripts/uncoverml.py | 2 +- uncoverml/transforms/linear.py | 26 +++++++++++++++----------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index 8abeb833..284b26fd 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -18,8 +18,9 @@ features: preprocessing: transforms: - whiten: - n_components: 2 -# keep_fraction: 0.2 # one of keep fraction or n_components allowed +# n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed + variation_fraction: 0.98 pca: geotif: diff --git a/uncoverml/config.py b/uncoverml/config.py index c226f810..3cf47323 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -300,6 +300,8 @@ def __init__(self, yaml_file): self.n_components = preprocessing_transforms[0]['whiten']['n_components'] elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] + elif 'variation_fraction' in preprocessing_transforms[0]['whiten']: + self.variation_fraction = preprocessing_transforms[0]['whiten']['variation_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index c7d632a8..71e4e6ee 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -442,7 +442,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, outfile_tif, config.n_subchunks, config.output_dir, - band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)], + band_tags=[f'_pc_{n}' for n in range(1, whiten_transform.keepdims+1)], **config.geotif_options) for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index ebf3e4e3..0017da2f 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -80,17 +80,16 @@ def __call__(self, *args): class WhitenTransform: - def __init__(self, keep_fraction=None, n_components=None): + def __init__(self, keep_fraction=None, n_components=None, variation_fraction=None): self.mean = None self.eigvals = None self.eigvecs = None 
self.keep_fraction = keep_fraction self.n_components = n_components - if n_components is None: - assert keep_fraction is not None - if keep_fraction is None: - assert n_components is not None - assert (keep_fraction is None) or (n_components is None) + self.variation_fraction = variation_fraction + null_check = [x is None for x in [keep_fraction, n_components, variation_fraction]] + assert np.sum(list(null_check)) == 2, "only one of keep_fraction, n_components, " \ + "variation_fraction can be non null" self.explained_ratio = {} def __call__(self, x): @@ -103,16 +102,21 @@ def __call__(self, x): self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x) ndims = x.shape[1] # make sure 1 <= keepdims <= ndims - if self.n_components is None: + self.explained_ratio = { + ndims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-ndims:])/np.sum(np.abs( + self.eigvals))) + } + + if self.n_components is None and self.variation_fraction is None: self.keepdims = min(max(1, int(ndims * self.keep_fraction)), ndims) + elif self.n_components is None and self.keep_fraction is None: + explained_ratio_cum_sum = np.cumsum(list(self.explained_ratio.values())[::-1]) + self.keepdims = np.searchsorted(explained_ratio_cum_sum > self.variation_fraction*100, True) + 1 + log.info(f"Variation percentage {self.variation_fraction*100} requires only {self.keepdims} PCs") else: self.keepdims = self.n_components assert self.n_components < ndims, "More components demanded than features. Not possible! \n " \ "Please reduce n_components or increase the number of features" - self.explained_ratio = { - self.keepdims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-self.keepdims:])/np.sum(np.abs( - self.eigvals))) - } mat = self.eigvecs[:, - self.keepdims:] vec = self.eigvals[np.newaxis, - self.keepdims:] From 00d0f7d82bbf5da1c8abe186a884bfed577c0517 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Mon, 12 Dec 2022 18:09:44 +1100 Subject: [PATCH 25/33] pca writeout was missing apply masked functionality --- configs/reference_pca.yaml | 8 +++++++- uncoverml/features.py | 4 +++- uncoverml/mpiops.py | 7 ++++++- uncoverml/predict.py | 3 ++- uncoverml/scripts/uncoverml.py | 7 +++---- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index 284b26fd..46b76757 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -4,7 +4,7 @@ memory_fraction: 0.5 features: - name: my continuous features - type: continuous + type: ordinal files: - path: configs/data/sirsam/er_depg.tif - path: configs/data/sirsam/sagawet_b_sir.tif @@ -14,6 +14,8 @@ features: - path: configs/data/sirsam/relief_apsect.tif - path: configs/data/sirsam/LATITUDE_GRID1.tif - path: configs/data/sirsam/LONGITUDE_GRID1.tif + imputation: nn +# nodes: 5000 preprocessing: transforms: @@ -22,6 +24,10 @@ preprocessing: # keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed variation_fraction: 0.98 +mask: + file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif + retain: 1 + pca: geotif: TILED: YES diff --git a/uncoverml/features.py b/uncoverml/features.py index 3d7fcebc..ffa16da0 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -87,7 +87,9 @@ def transform_features(feature_sets, transform_sets, final_transform, config): if mpiops.chunk_index == 0 and config.pickle: log.info('Saving featurevec for reuse') pickle.dump(feature_vec, open(config.featurevec, 'wb')) - + if mpiops.chunk_index == 0: + for i, f in 
enumerate(feature_sets[0].keys()): + log.debug(f"Using feature num {i} from: {f}") x = np.ma.concatenate(transformed_vectors, axis=1) if config.cubist or config.multicubist or config.krige: log.warning("{}: Ignoring preprocessing " diff --git a/uncoverml/mpiops.py b/uncoverml/mpiops.py index ad17814f..5c8e37a4 100644 --- a/uncoverml/mpiops.py +++ b/uncoverml/mpiops.py @@ -146,7 +146,11 @@ def outer(x): out = comm.allreduce(x_outer_local) still_masked = np.ma.count_masked(out) if still_masked != 0: - log.info('Reported out: ' + ', '.join([str(s) for s in out])) + log.info("=========still_masked =================") + if chunk_index == 0: + for s in out: + log.info(s) + # log.info('Reported out: ' + ', '.join([str(s) for s in out])) raise ValueError("Can't compute outer product:" " completely missing columns!") if hasattr(out, 'mask'): @@ -161,6 +165,7 @@ def covariance(x): def eigen_decomposition(x): + log.info(f"Eigenvalue matrix shape in process {chunk_index}: {x.shape}") eigvals, eigvecs = np.linalg.eigh(covariance(x)) return eigvals, eigvecs diff --git a/uncoverml/predict.py b/uncoverml/predict.py index f8ba2e71..b3ead3a7 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -223,7 +223,8 @@ def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): log.info("Extracting PCAs....") # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue # is the first PC. This will hopefully remove some ambiguity - image_out.write(np.flip(x, axis=1), subchunk) + x = apply_masked(lambda _x: _x, np.flip(x, axis=1)) + image_out.write(x, subchunk) def export_pca_fractions(config: Config): diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 71e4e6ee..9f7af6ec 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -414,8 +414,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): config.n_subchunks = partitions if config.n_subchunks > 1: - log.info("Memory contstraint forcing {} iterations " - "through data".format(config.n_subchunks)) + log.info("Memory contstraint forcing {} iterations through data".format(config.n_subchunks)) else: log.info("Using memory aggressively: dividing all data between nodes") @@ -427,7 +426,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): transform_sets, config.final_transform, config) - print(f"process {ls.mpiops.chunk_index} has {features.shape[0]} samples") + features, _ = ls.features.remove_missing(features) num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0])) log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" f" {num_samples} points from the rasters") @@ -442,7 +441,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, outfile_tif, config.n_subchunks, config.output_dir, - band_tags=[f'_pc_{n}' for n in range(1, whiten_transform.keepdims+1)], + band_tags=[f'pc_{n}' for n in range(1, whiten_transform.keepdims+1)], **config.geotif_options) for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) From 0686d0c6f266b49d1a85dbe2241b8dcacf001000 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 13 Dec 2022 06:51:37 +1100 Subject: [PATCH 26/33] remove all masked area treatment --- uncoverml/predict.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py 
index b3ead3a7..d8ae5231 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -144,16 +144,16 @@ def _get_data(subchunk, config): # NOTE: This returns an *untransformed* x, # which is ok as we just need dummies here - if config.mask: - mask_x = _mask(subchunk, config) - all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) - if all_mask_x.shape[0] == np.sum(all_mask_x.mask): - x = np.ma.zeros((mask_x.shape[0], len(features_names)), - dtype=np.bool) - x.mask = True - log.info('Partition {} covariates are not loaded as ' - 'the partition is entirely masked.'.format(subchunk + 1)) - return x, features_names + # if config.mask: + # mask_x = _mask(subchunk, config) + # all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) + # if all_mask_x.shape[0] == np.sum(all_mask_x.mask): + # x = np.ma.zeros((mask_x.shape[0], len(features_names)), + # dtype=np.bool) + # x.mask = True + # log.info('Partition {} covariates are not loaded as ' + # 'the partition is entirely masked.'.format(subchunk + 1)) + # return x, features_names transform_sets = [k.transform_set for k in config.feature_sets] extracted_chunk_sets = geoio.image_subchunks(subchunk, config) From 81e7cd5d4af5d909b7c944a22050d75f6f0761ca Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:19:47 +1100 Subject: [PATCH 27/33] some funny script to smooth elevation --- scripts/smooth_elevation.py | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 scripts/smooth_elevation.py diff --git a/scripts/smooth_elevation.py b/scripts/smooth_elevation.py new file mode 100644 index 00000000..32dccbda --- /dev/null +++ b/scripts/smooth_elevation.py @@ -0,0 +1,64 @@ +from pathlib import Path +import numpy as np +from scipy.interpolate import interp2d, RegularGridInterpolator +import rasterio as rio +from rasterio.io import DatasetReader +smooth_dem = Path("/home/sudipta/Documents/nci/smooth_dem") +elevation = smooth_dem.joinpath("elevation_small.tif") +ceno = smooth_dem.joinpath("ceno_small.tif") +relief = smooth_dem.joinpath("relief_small.tif") + +with rio.open(elevation) as geotif: + profile: rio.profiles.Profile = geotif.profile + ele: DatasetReader = geotif.read(1, masked=True) + +with rio.open(relief) as geotif: + rel: DatasetReader = geotif.read(1, masked=True) + +with rio.open(ceno) as geotif: + cen: DatasetReader = geotif.read(1, masked=True) + +ele_mod = np.ma.array(data=ele.data, mask=(ele.mask | ((cen.data > 0.01) & (rel.data > 2.3))), fill_value=np.nan) +ele_mod.data[ele_mod.mask] = np.nan +# ele_mod.mask = ele.mask | ((cen.data > 0.01) & (rel.data > 0.01)) + + +def points_removed_interp(): + z = ele_mod + x, y = np.mgrid[0:z.shape[0], 0:z.shape[1]] + x1 = x[~z.mask] + y1 = y[~z.mask] + z1 = z[~z.mask] + print(z1.shape) + return interp2d(x1, y1, z1, kind="linear")(np.arange(z.shape[0]), np.arange(z.shape[1])) + + +def recatangular_interp(): + z = ele_mod.filled(np.nan) + x = np.array(range(ele_mod.shape[0])) + y = np.array(range(ele_mod.shape[1])) + zinterp = RegularGridInterpolator((x, y), z, method="linear") + X2, Y2 = np.meshgrid(x, y) + newpoints = np.array((X2, Y2)).T + + # actual interpolation + z2 = zinterp(newpoints) + z2_masked = np.ma.array(z2, mask=np.isnan(z2)) + return z2_masked + + +profile.update(compress='lzw') + +with rio.open(smooth_dem.joinpath("smooth_elevation_small_masked_ref_2p3.tif"), mode="w", **profile,) as update_dataset: + # out = recatangular_interp() + # out = points_removed_interp() + print(profile) + update_dataset.write(ele_mod, 1) + + +# 
import IPython; IPython.embed(); import sys; sys.exit() +# flat_ele = interp2d(x1, y1, z1) +# (np.arange(ele_mod.shape[0]), np.arange(ele_mod.shape[1])) + + + From cea9660f86e076d11b51be3ed01c58fcd1bdd74d Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:20:08 +1100 Subject: [PATCH 28/33] bugfix --- uncoverml/predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index d8ae5231..1cc8d529 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -208,7 +208,7 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) except ValueError as v: log.warning(v) - x = _fix_for_corrupt_data(x, features_names) + x = _fix_for_corrupt_data(x, feature_names) y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) From eaf39ab2090ca29ce327da016b3f1123893fb5a7 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:20:49 +1100 Subject: [PATCH 29/33] wip - prediction on a different extent --- configs/ref_rf.yaml | 1 + uncoverml/config.py | 15 +++++++++------ uncoverml/geoio.py | 13 +++++++++---- uncoverml/scripts/uncoverml.py | 5 ++++- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index 2f3d8c89..cbac2e59 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -71,6 +71,7 @@ learning: prediction: +# prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/uncoverml/config.py b/uncoverml/config.py index 3cf47323..8aa8bbf2 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -164,15 +164,18 @@ def __init__(self, yaml_file): self.krige = self.algorithm == 'krige' if 'prediction' in s: - self.quantiles = s['prediction']['quantiles'] - self.geotif_options = s['prediction']['geotif'] if 'geotif' in \ - s['prediction'] else {} + prediction = s["prediction"] + self.quantiles = prediction['quantiles'] + self.geotif_options = prediction['geotif'] if 'geotif' in prediction else {} self.outbands = None - if 'outbands' in s['prediction']: - self.outbands = s['prediction']['outbands'] - self.thumbnails = s['prediction']['thumbnails'] \ + if 'outbands' in prediction: + self.outbands = prediction['outbands'] + self.thumbnails = prediction['thumbnails'] \ if 'thumbnails' in s['prediction'] else 10 + self.prediction_template = prediction["prediction_template"] \ + if "prediction_template" in prediction else None + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 03c65d95..1a37547e 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -284,14 +284,17 @@ def load_targets(shapefile, targetfield, conf: Config): return targets -def get_image_spec(model, config): +def get_image_spec(model, config: Config): # temp workaround, we should have an image spec to check against nchannels = len(model.get_predict_tags()) return get_image_spec_from_nchannels(nchannels, config) -def get_image_spec_from_nchannels(nchannels, config): - imagelike = config.feature_sets[0].files[0] +def get_image_spec_from_nchannels(nchannels, config: Config): + if config.prediction_template: + imagelike = Path(config.prediction_template).absolute() + else: + imagelike = 
config.feature_sets[0].files[0] template_image = image.Image(RasterioImageSource(imagelike)) eff_shape = template_image.patched_shape(config.patchsize) + (nchannels,) eff_bbox = template_image.patched_bbox(config.patchsize) @@ -476,9 +479,11 @@ def f(image_source): return result -def image_subchunks(subchunk_index, config): +def image_subchunks(subchunk_index, config: Config): + """This is used in prediction only""" def f(image_source): + # pred_image_source = r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) return r result = _iterate_sources(f, config) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 9f7af6ec..14d21832 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -326,7 +326,9 @@ def validate(pipeline_file, model_or_cluster_file, partitions): help='mask file used to limit prediction area') @click.option('-r', '--retain', type=int, default=None, help='mask values where to predict') -def predict(model_or_cluster_file, partitions, mask, retain): +@click.option('-t', '--prediction_template', type=click.Path(exists=True), default=None, + help='mask values where to predict') +def predict(model_or_cluster_file, partitions, mask, retain, prediction_template): with open(model_or_cluster_file, 'rb') as f: state_dict = joblib.load(f) @@ -336,6 +338,7 @@ def predict(model_or_cluster_file, partitions, mask, retain): config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \ else False config.mask = mask if mask else config.mask + config.prediction_template = prediction_template if prediction_template else config.prediction_template if config.mask: config.retain = retain if retain else config.retain From 79510bff201801ffa8c8dfcccf9379eb97ae8e38 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 14:18:40 +1100 Subject: [PATCH 30/33] suppress wanrings --- uncoverml/scripts/uncoverml.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 14d21832..dd396e34 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -14,6 +14,10 @@ from pathlib import Path import warnings +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=DeprecationWarning) +warnings.filterwarnings(action='ignore', category=Warning) + import click import numpy as np import matplotlib @@ -38,9 +42,6 @@ log = logging.getLogger(__name__) # warnings.showwarning = warn_with_traceback -warnings.filterwarnings(action='ignore', category=FutureWarning) -warnings.filterwarnings(action='ignore', category=DeprecationWarning) -warnings.filterwarnings(action='ignore', category=Warning) @click.group() From 340adaab2d0437776a255d77a99dcfc09c94349c Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 14:20:18 +1100 Subject: [PATCH 31/33] working - prediction on a different extent --- uncoverml/features.py | 3 ++- uncoverml/geoio.py | 48 ++++++++++++++++++++++++++----------------- uncoverml/image.py | 3 ++- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/uncoverml/features.py b/uncoverml/features.py index ffa16da0..dd0cd2e9 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -11,11 +11,12 @@ from uncoverml import patch from uncoverml import transforms from uncoverml.config import Config +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) -def 
extract_subchunks(image_source, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 1a37547e..ceeb3fd6 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -1,4 +1,5 @@ from __future__ import division +from typing import Optional import joblib import os.path from pathlib import Path @@ -78,12 +79,17 @@ def crs(self): class RasterioImageSource(ImageSource): - def __init__(self, filename): + def __init__(self, filename, template_filename: Optional[str] = None): self.filename = filename + self.template = template_filename assert os.path.isfile(filename), '{} does not exist'.format(filename) + + template_geotif = rasterio.open(template_filename, 'r') if self.template else None + with rasterio.open(self.filename, 'r') as geotiff: - self._full_res = (geotiff.width, geotiff.height, geotiff.count) + self._full_res = (geotiff.width, geotiff.height, geotiff.count) if template_geotif is None else \ + (template_geotif.width, template_geotif.height, template_geotif.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes for d in geotiff.dtypes[1:]: @@ -93,7 +99,7 @@ def __init__(self, filename): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = geotiff.transform + A = geotiff.transform if template_geotif is None else template_geotif.transform # No shearing or rotation allowed!! if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -428,7 +434,7 @@ def output_thumbnails(self, ratio=10): resample(f, output_tif=thumbnail, ratio=ratio) -def feature_names(config): +def feature_names(config: Config): results = [] for s in config.feature_sets: @@ -444,24 +450,14 @@ def feature_names(config): def _iterate_sources(f, config): results = [] + template_tif = config.prediction_template for s in config.feature_sets: extracted_chunks = {} for tif in s.files: name = os.path.abspath(tif) - image_source = RasterioImageSource(tif) + image_source = RasterioImageSource(tif, template_filename=template_tif) x = f(image_source) - # TODO this may hurt performance. Consider removal - if type(x) is np.ma.MaskedArray: - count = mpiops.count(x) - # if not np.all(count > 0): - # s = ("{} has no data in at least one band.".format(name) + - # " Valid_pixel_count: {}".format(count)) - # raise ValueError(s) - missing_percent = missing_percentage(x) - t_missing = mpiops.comm.allreduce( - missing_percent) / mpiops.chunks - log.info("{}: {}px {:2.2f}% missing".format( - name, count, t_missing)) + log_missing_percentage(name, x) extracted_chunks[name] = x extracted_chunks = OrderedDict(sorted( extracted_chunks.items(), key=lambda t: t[0])) @@ -470,6 +466,21 @@ def _iterate_sources(f, config): return results +def log_missing_percentage(name, x): + # TODO this may hurt performance. 
Consider removal + if type(x) is np.ma.MaskedArray: + count = mpiops.count(x) + # if not np.all(count > 0): + # s = ("{} has no data in at least one band.".format(name) + + # " Valid_pixel_count: {}".format(count)) + # raise ValueError(s) + missing_percent = missing_percentage(x) + t_missing = mpiops.comm.allreduce( + missing_percent) / mpiops.chunks + log.info("{}: {}px {:2.2f}% missing".format( + name, count, t_missing)) + + def image_resolutions(config): def f(image_source): r = image_source._full_res @@ -482,8 +493,7 @@ def f(image_source): def image_subchunks(subchunk_index, config: Config): """This is used in prediction only""" - def f(image_source): - # pred_image_source = + def f(image_source: RasterioImageSource): r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) return r result = _iterate_sources(f, config) diff --git a/uncoverml/image.py b/uncoverml/image.py index 395c23c9..3ad1912e 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -2,6 +2,7 @@ import logging from affine import Affine +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) @@ -27,7 +28,7 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap=0): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: From a055c1dc1631fc1995e90432f22ad79988023666 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 17:52:52 +1100 Subject: [PATCH 32/33] bug fix cropped prediction --- uncoverml/config.py | 2 ++ uncoverml/geoio.py | 12 ++++++------ uncoverml/patch.py | 5 +++-- uncoverml/scripts/uncoverml.py | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/uncoverml/config.py b/uncoverml/config.py index 8aa8bbf2..7e565e58 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -176,6 +176,8 @@ def __init__(self, yaml_file): self.prediction_template = prediction["prediction_template"] \ if "prediction_template" in prediction else None + self.is_prediction = False + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index ceeb3fd6..a57d06d2 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -82,14 +82,14 @@ class RasterioImageSource(ImageSource): def __init__(self, filename, template_filename: Optional[str] = None): self.filename = filename - self.template = template_filename assert os.path.isfile(filename), '{} does not exist'.format(filename) - template_geotif = rasterio.open(template_filename, 'r') if self.template else None + template_geotiff = rasterio.open(template_filename, 'r') if template_filename else None with rasterio.open(self.filename, 'r') as geotiff: - self._full_res = (geotiff.width, geotiff.height, geotiff.count) if template_geotif is None else \ - (template_geotif.width, template_geotif.height, template_geotif.count) + tiff = geotiff if template_geotiff is None else template_geotiff + + self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes for d in geotiff.dtypes[1:]: @@ -99,7 +99,7 @@ def __init__(self, filename, template_filename: Optional[str] = None): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = geotiff.transform if template_geotif is None else 
template_geotif.transform + A = tiff.transform # No shearing or rotation allowed!! if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -450,7 +450,7 @@ def feature_names(config: Config): def _iterate_sources(f, config): results = [] - template_tif = config.prediction_template + template_tif = config.prediction_template if config.is_prediction else None for s in config.feature_sets: extracted_chunks = {} for tif in s.files: diff --git a/uncoverml/patch.py b/uncoverml/patch.py index 028cce5d..e7e7f4df 100644 --- a/uncoverml/patch.py +++ b/uncoverml/patch.py @@ -5,6 +5,7 @@ import numpy as np import skimage +from uncoverml.image import Image log = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def point_patches(image, pwidth, points): return output -def _image_to_data(image): +def _image_to_data(image: Image): """ breaks up an image object into arrays suitable for sending to the patching functions @@ -92,7 +93,7 @@ def _image_to_data(image): return data, mask, data_dtype -def all_patches(image, patchsize): +def all_patches(image: Image, patchsize): data, mask, data_dtype = _image_to_data(image) patches = grid_patches(data, patchsize) patch_mask = grid_patches(mask, patchsize) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index dd396e34..fd54218a 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -336,6 +336,7 @@ def predict(model_or_cluster_file, partitions, mask, retain, prediction_template model = state_dict["model"] config = state_dict["config"] + config.is_prediction = True config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \ else False config.mask = mask if mask else config.mask From 45941831e1d1e3c49df60eab607c52b4b7568248 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 9 Feb 2023 11:04:38 +1100 Subject: [PATCH 33/33] prediction on a different extent finally working? 
--- configs/ref_rf.yaml | 14 +++++++------- uncoverml/features.py | 6 ++++-- uncoverml/geoio.py | 22 +++++++++++++--------- uncoverml/image.py | 38 +++++++++++++++++++++++++++----------- 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index cbac2e59..bdfbf1c0 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -14,19 +14,19 @@ features: - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif transforms: - - standardise - - whiten: - keep_fraction: 0.8 +# - standardise +# - whiten: +# keep_fraction: 0.8 imputation: none preprocessing: imputation: none transforms: - - whiten: - keep_fraction: 0.8 +# - whiten: +# keep_fraction: 0.8 targets: - file: configs/data/geochem_sites.shp + file: configs/data/geochem_sites_cropped.shp property: K_ppm_imp # group_targets: # groups_eps: 0.09 @@ -71,7 +71,7 @@ learning: prediction: -# prediction_template: configs/data/sirsam/dem_foc2.tif + prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/uncoverml/features.py b/uncoverml/features.py index dd0cd2e9..161e3804 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from collections import OrderedDict import numpy as np import pickle @@ -16,10 +17,11 @@ log = logging.getLogger(__name__) -def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize, + template_source: Optional[RasterioImageSource] = None): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index - image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) + image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize, template_source) x = patch.all_patches(image, patchsize) return x diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index a57d06d2..1d477230 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -79,15 +79,12 @@ def crs(self): class RasterioImageSource(ImageSource): - def __init__(self, filename, template_filename: Optional[str] = None): + def __init__(self, filename): self.filename = filename assert os.path.isfile(filename), '{} does not exist'.format(filename) - template_geotiff = rasterio.open(template_filename, 'r') if template_filename else None - with rasterio.open(self.filename, 'r') as geotiff: - tiff = geotiff if template_geotiff is None else template_geotiff self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] @@ -99,7 +96,7 @@ def __init__(self, filename, template_filename: Optional[str] = None): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = tiff.transform + A = geotiff.transform # No shearing or rotation allowed!! 
if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -297,7 +294,7 @@ def get_image_spec(model, config: Config): def get_image_spec_from_nchannels(nchannels, config: Config): - if config.prediction_template: + if config.prediction_template and config.is_prediction: imagelike = Path(config.prediction_template).absolute() else: imagelike = config.feature_sets[0].files[0] @@ -447,15 +444,17 @@ def feature_names(config: Config): return results -def _iterate_sources(f, config): +def _iterate_sources(f, config: Config): results = [] template_tif = config.prediction_template if config.is_prediction else None + if config.is_prediction: + log.info(f"Using prediction template {config.prediction_template}") for s in config.feature_sets: extracted_chunks = {} for tif in s.files: name = os.path.abspath(tif) - image_source = RasterioImageSource(tif, template_filename=template_tif) + image_source = RasterioImageSource(tif) x = f(image_source) log_missing_percentage(name, x) extracted_chunks[name] = x @@ -494,7 +493,12 @@ def image_subchunks(subchunk_index, config: Config): """This is used in prediction only""" def f(image_source: RasterioImageSource): - r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) + if config.is_prediction and config.prediction_template is not None: + template_source = RasterioImageSource(config.prediction_template) + else: + template_source = None + r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize, + template_source=template_source) return r result = _iterate_sources(f, config) return result diff --git a/uncoverml/image.py b/uncoverml/image.py index 3ad1912e..0f4494d4 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -1,3 +1,4 @@ +from typing import Optional import numpy as np import logging @@ -28,7 +29,10 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, + chunk_idx=0, nchunks=1, overlap=0, + t_source: Optional[RasterioImageSource] = None, + ): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: @@ -49,6 +53,10 @@ def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap= self.pixsize_x = source.pixsize_x self.pixsize_y = source.pixsize_y self.crs = source.crs + log.debug(f"Image full resolution : {source.full_resolution}") + log.debug(f"Image origin longitude : {source.origin_longitude}") + log.debug(f"Image origin latitude : {source.origin_latitude}") + assert self.pixsize_x > 0 assert self.pixsize_y > 0 @@ -65,21 +73,29 @@ def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap= self._pix_y_to_coords = dict(zip(pix_y, coords_y)) # exclusive y range of this chunk in full image - ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] - self._offset = np.array([0, ymin], dtype=int) - # exclusive x range of this chunk (same for all chunks) - xmin, xmax = 0, self._full_res[0] + if t_source: + self._t_full_res = t_source.full_resolution + self._t_start_lon = t_source.origin_longitude + self._t_start_lat = t_source.origin_latitude + self._t_pixsize_x = t_source.pixsize_x + self._t_pixsize_y = t_source.pixsize_y + tymin, tymax = construct_splits(self._t_full_res[1], nchunks, overlap)[chunk_idx] + t_x_offset = np.searchsorted(self._coords_x, self._t_start_lon) + t_y_offset = 
np.searchsorted(self._coords_y, self._t_start_lat) + xmin, xmax = t_x_offset, t_x_offset + self._t_full_res[0] + ymin, ymax = t_y_offset + tymin, t_y_offset + tymax + self.resolution = (xmax-xmin, ymax - ymin, self._t_full_res[2]) + else: + # exclusive x range of this chunk (same for all chunks) + xmin, xmax = 0, self._full_res[0] + ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] + self.resolution = (xmax - xmin, ymax - ymin, self._full_res[2]) + self._offset = np.array([xmin, ymin], dtype=int) assert(xmin < xmax) assert(ymin < ymax) - # get resolution of this chunk - xres = self._full_res[0] - yres = ymax - ymin - # Calculate the new values for resolution and bounding box - self.resolution = (xres, yres, self._full_res[2]) - start_bound_x, start_bound_y = self._global_pix2lonlat( np.array([[xmin, ymin]]))[0] # one past the last pixel