From afe8b9b2c11f10d6ca123b98dca0d64139e8aaba Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 1 Mar 2022 10:09:30 +1100 Subject: [PATCH 01/33] write cleaned and dropped when we are actually dropping --- scripts/intersect_rasters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index f1fa77f0..917599bc 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -185,11 +185,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) else: + print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") # rets = Parallel( From ba59a2712d11c1f9dc971d598d95bade6cded34c Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 2 Mar 2022 08:27:30 +1100 Subject: [PATCH 02/33] drop duplicate values during intersect --- scripts/intersect_rasters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 917599bc..31e7fafe 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -125,10 +125,10 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() # import IPython; IPython.embed(); import sys; sys.exit() - pts_deduped = pts_mean.merge( - pts_count, how='inner', on=['rows', 'cols']).merge( - pts[['rows', 'cols', 'geometry']], how='inner', on=['rows', 'cols'] - ) + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: From 49623bd213b41b492abadee1ba4e905ffced2871 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 8 Mar 2022 17:24:09 +1100 Subject: [PATCH 03/33] geom cols are inserted even without dedupe --- scripts/intersect_rasters.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 31e7fafe..d983228d 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -30,7 +30,7 @@ def generate_key_val(gl, shorts): shorts[eight_char_name] += 1 else: shorts[eight_char_name] += 1 - return Path(gl).name, eight_char_name + str(shorts[eight_char_name]) + return gl, eight_char_name + str(shorts[eight_char_name]) # import IPython; IPython.embed(); import sys; sys.exit() @@ -92,11 +92,11 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], 
dedupe: bool = print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) tif_name = list(geotifs.keys())[0] - tif = data_location.joinpath(tif_name) + tif = Path(tif_name) orig_cols = pts.columns with rasterio.open(tif) as src: # resample data to target shape @@ -135,12 +135,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_deduped = pts coords_deduped = coords - for k, v in geotifs.items(): - print(f"adding {k} to output dataframe") - with rasterio.open(data_location.joinpath(k)) as src: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + with rasterio.open(Path(k)) as src: pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] - pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) output_dir = Path('out_resampled') output_dir.mkdir(exist_ok=True, parents=True) out_shp = output_dir.joinpath(shp.name) @@ -152,7 +152,6 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = if __name__ == '__main__': # local - data_location = Path("configs/data/sirsam") # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') shapefile_location = Path("configs/data") shp = shapefile_location.joinpath('geochem_sites.shp') @@ -171,15 +170,19 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = k, v = generate_key_val(gl, shorts) geotifs[k] = v - for k, v in geotifs.items(): - print(f"\"{k}\": \"{v}\"") - # check all required files are available on disc for k, v in geotifs.items(): print(f"checking if {k} exists") - assert data_location.joinpath(k).exists() + assert Path(k).exists() + + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=True) + out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=False) df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) df3 = df2[list(geotifs.values())] df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] From 70a03b0c7dbb0245a1c6b5c5e857117f4ac88fda Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 10 Mar 2022 16:31:12 +1100 Subject: [PATCH 04/33] write another wrapper function including clean --- scripts/intersect_rasters.py | 45 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index d983228d..13364d5d 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -150,11 +150,29 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = return out_shp +def intersect_sample_and_clean(shp, dedupe: bool = False): + out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) + df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = 
df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + if __name__ == '__main__': # local # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') shapefile_location = Path("configs/data") - shp = shapefile_location.joinpath('geochem_sites.shp') + # shp = shapefile_location.joinpath('geochem_sites.shp') downscale_factor = 2 # keep 1 point in a 2x2 cell @@ -181,22 +199,9 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = print(f"\"{Path(k).name}\": \"{v}\"") print('='*100) - - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=False) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) - df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - - if df5.shape[0]: - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) - else: - print(f"No points dropped and there for _cleaned.shp file is not createed'") - print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - -# rets = Parallel( - # n_jobs=-1, - # verbose=100, - # )(delayed(intersect_and_sample_shp)(s) for s in shapefile_location.glob("*.shp")) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) From 0f5a456cbcfd7661374b62109050fddf651c9878 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 10 Mar 2022 17:11:03 +1100 Subject: [PATCH 05/33] change to single shapefile multiple raster intersect --- scripts/intersect_rasters.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 13364d5d..d421c880 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -135,10 +135,16 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = pts_deduped = pts coords_deduped = coords - for i, (k, v) in enumerate(geotifs.items()): - print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") - with rasterio.open(Path(k)) as src: - pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) output_dir = 
Path('out_resampled') @@ -150,6 +156,12 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = return out_shp +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + def intersect_sample_and_clean(shp, dedupe: bool = False): out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) From e828de1620d833821bcb91b77c75a43f10275819 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 19 Mar 2022 14:25:22 +1100 Subject: [PATCH 06/33] default to only used shapes --- scripts/intersect_rasters.py | 48 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index d421c880..0aef9989 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -88,7 +88,7 @@ def generate_key_val(gl, shorts): # } -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False): +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) coords = np.array([(p.x, p.y) for p in pts.geometry]) @@ -138,22 +138,22 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # for k, v in geotifs.items(): # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") # pts_deduped[v] = __extract_raster_points(coords_deduped, k) - pts_deduped__ = Parallel( - n_jobs=-1, - verbose=100, - )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) - - for i, col_name in enumerate(geotifs.values()): - pts_deduped[col_name] = pts_deduped__[i] + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) - output_dir = Path('out_resampled') - output_dir.mkdir(exist_ok=True, parents=True) - out_shp = output_dir.joinpath(shp.name) - pts_deduped.to_file(out_shp.as_posix()) - print(f"saved intersected shapefile at {out_shp.as_posix()}") # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) - return out_shp + return pts_deduped def __extract_raster_points(raster, coords_deduped): @@ -162,9 +162,12 @@ def __extract_raster_points(raster, coords_deduped): return [x[0] for x in src.sample(coords_deduped)] -def intersect_sample_and_clean(shp, dedupe: bool = False): - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) df3 = df2[list(geotifs.values())] df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & 
((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] @@ -172,8 +175,11 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): if df5.shape[0]: df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") except: @@ -182,10 +188,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): if __name__ == '__main__': # local - # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') + output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - # shp = shapefile_location.joinpath('geochem_sites.shp') - downscale_factor = 2 # keep 1 point in a 2x2 cell geom_cols = ['POINT_X', 'POINT_Y'] @@ -212,7 +216,7 @@ def intersect_sample_and_clean(shp, dedupe: bool = False): print('='*100) for s in shapefile_location.glob("*.shp"): - intersect_sample_and_clean(s, True) + intersect_sample_and_clean(s, True, True) # rets = Parallel( # n_jobs=-1, # verbose=100, From b232364601525ac28837721b1a07ae4a739dccde Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 1 Apr 2022 13:48:10 +1100 Subject: [PATCH 07/33] adding median based filtering of majors dataset --- scripts/dedupe_with_median_scratch.py | 291 ++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 scripts/dedupe_with_median_scratch.py diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py new file mode 100644 index 00000000..d24be9cc --- /dev/null +++ b/scripts/dedupe_with_median_scratch.py @@ -0,0 +1,291 @@ +from typing import Dict +import csv +from collections import defaultdict +from os import path +from pathlib import Path +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd +from joblib import Parallel, delayed + + +def read_list_file(list_path: str): + files = [] + csvfile = path.abspath(list_path) + with open(csvfile, 'r') as f: + reader = csv.reader(f) + tifs = list(reader) + tifs = [f[0].strip() for f in tifs + if (len(f) > 0 and f[0].strip() and + f[0].strip()[0] != '#')] + for f in tifs: + files.append(path.abspath(f)) + return files + + +def generate_key_val(gl, shorts): + eight_char_name = Path(gl).name[:8] + if eight_char_name not in shorts: + shorts[eight_char_name] += 1 + else: + shorts[eight_char_name] += 1 + return gl, eight_char_name + str(shorts[eight_char_name]) + + +# import IPython; IPython.embed(); import sys; sys.exit() + + +# data_location = \ +# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") +# Read points from shapefile + +# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") + +# geotifs = { +# "relief_radius4.tif": "relief4", +# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", +# "MvrtpLL_smooth.tif": "mrvtpll_s", +# "MvrtpLL_fin.tif": "mvrtpll_f", +# "mrvbf_9.tif": "mrvbf_9", +# "Rad2016K_Th.tif": "rad2016kth", +# "Thorium_2016.tif": "th_2016", +# "Mesozoic_older_raster_MEAN.tif": "meso_mean", +# 
"LOC_distance_to_coast.tif": "loc_dis", +# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", +# "water-85m_1.tif": "water_85m", +# "clim_RSM_albers.tif": "clim_rsm", +# "tpi_300.tif": "tpi_300", +# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", +# "si_geol1.tif": "si_geol1", +# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", +# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", +# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", +# "Gravity_land.tif": "gravity", +# "dem_fill.tif": "dem", +# "Clim_Prescott_LindaGregory.tif": "clim_linda", +# "slope_fill2.tif": "slopefill2", +# "clim_PTA_albers.tif": "clim_alber", +# "SagaWET9cell_M.tif": "sagawet", +# "ceno_euc_aust1.tif": "ceno_euc", +# "s2-dpca-85m_band1.tif": "s2_band1", +# "s2-dpca-85m_band2.tif": "s2_band2", +# "s2-dpca-85m_band3.tif": "s2_band3", +# "3dem_mag0_finn.tif": "3dem_mag0", +# "3dem_mag1_fin.tif": "3dem_mag1", +# "3dem_mag2.tif": "3dem_mag2", +# } + +# geotifs = { +# "relief_apsect.tif": "relief4", +# "LATITUDE_GRID1.tif": "latitude", +# "LONGITUDE_GRID1.tif": "longitude", +# "er_depg.tif": "er_depg", +# "sagawet_b_sir.tif": "sagawet", +# "dem_foc2.tif": "dem_foc2", +# "outcrop_dis2.tif": "outcrop", +# "k_15v5.tif": "k_15v5", +# } + +majors = [ + "SIO2", + "TIO2", + "AL2O3", + "FE2O3TOT", + "FE2O3", + "FEO", + "MNO", + "MGO", + "CAO", + "NA2O", + "K2O", + "P2O5", +] + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], + dedupe: bool = False, + intersect_parallel: bool = False, + pixel_count: bool = False, + dedupe_with_median: bool = True + ): + print("====================================\n", f"intersecting {shp.as_posix()}") + pts = gpd.read_file(shp) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + if dedupe: + tif_name = list(geotifs.keys())[0] + tif = Path(tif_name) + # orig_cols = pts.columns + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + # keep one only + # pts_deduped = pts.sort_values( + # by=geom_cols, ascending=[True, True] + # ).groupby(by=['rows', 'cols'], as_index=False).first()[orig_cols] + + # keep mean of repeated observations in a pixel + # + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(px_count=('rows', 'count')) + # if dedupe_with_median: + # import IPython; IPython.embed(); import sys; sys.exit() + + +pts_majors = pts[majors + ['rows', 'cols'] + geom_cols + ['geometry']] +for m in majors: + pts_majors[m] = pts_majors[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=['rows', 'cols']) +rename_dict = {k: k + '_m' for k in majors} +pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=['rows', 'cols']) +for m in majors: + all_pts_with_count_and_median[m + '_distance_from_median'] = \ + 
abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ + abs(all_pts_with_count_and_median[m + '_m']) + +def sum_distance_from_median(x): + s = np.nansum([v for k, v in x.items() if k.endswith('_distance_from_median')]) + return s + +all_pts_with_count_and_median['median_sum'] = all_pts_with_count_and_median.apply(lambda x: sum_distance_from_median( + x), axis=1) + +all_pts_with_count_and_median_min_median_sum = \ + all_pts_with_count_and_median.groupby(by=['rows', 'cols'], as_index=False).agg( + median_sum_min=('median_sum', 'min'), + median_sum_std=('median_sum', 'std') + ) + +all_pts = all_pts_with_count_and_median.merge(all_pts_with_count_and_median_min_median_sum, how='left', on=['rows', + 'cols']) + + +# all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] + +# min_pts = all_pts.groupby(by=['rows', 'cols'], as_index=False).median_sum_to_median_min_diff.agg( +# ['min', 'std'] +# ) + +final_pts = all_pts[all_pts.median_sum==all_pts.median_sum_min] +final_pts_sorted = final_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) +all_pts_sorted = all_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) + +# all_pts.merge(min_pts, how="inner", on=['rows', 'cols', "median_sum_to_median_min_diff"]) + +# all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], +# how='left',on=['rows', 'cols']) + + else: # dedupe with mean + pts_mean = pts[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + # import IPython; IPython.embed(); import sys; sys.exit() + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) + # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] + coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) + else: + pts_deduped = pts + coords_deduped = coords + + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) + + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = 
df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + +if __name__ == '__main__': + # local + output_dir = Path('out_resampled') + shapefile_location = Path("configs/data") + downscale_factor = 2 # keep 1 point in a 2x2 cell + + geom_cols = ['POINT_X', 'POINT_Y'] + + covariastes_list = "configs/data/sirsam/covariates_list.txt" + geotifs_list = read_list_file(list_path=covariastes_list) + shorts = defaultdict(int) + + geotifs = {} + + for gl in geotifs_list: + k, v = generate_key_val(gl, shorts) + geotifs[k] = v + + # check all required files are available on disc + for k, v in geotifs.items(): + print(f"checking if {k} exists") + assert Path(k).exists() + + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) From a1473f128939780e7d021515fdff0ac2be66e40a Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 9 Apr 2022 07:23:55 +1000 Subject: [PATCH 08/33] updated --- scripts/dedupe_with_median_scratch.py | 36 ++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py index d24be9cc..e0bbc3fb 100644 --- a/scripts/dedupe_with_median_scratch.py +++ b/scripts/dedupe_with_median_scratch.py @@ -109,10 +109,10 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe_with_median: bool = True ): print("====================================\n", f"intersecting {shp.as_posix()}") - pts = gpd.read_file(shp) - coords = np.array([(p.x, p.y) for p in pts.geometry]) - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) +pts = gpd.read_file(shp) +coords = np.array([(p.x, p.y) for p in pts.geometry]) +geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) +pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: tif_name = list(geotifs.keys())[0] tif = Path(tif_name) @@ -137,24 +137,26 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], # keep one only # pts_deduped = pts.sort_values( # by=geom_cols, ascending=[True, True] - # ).groupby(by=['rows', 'cols'], as_index=False).first()[orig_cols] + # ).groupby(by=rows_and_cols, as_index=False).first()[orig_cols] # keep mean of repeated observations in a pixel # - pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(px_count=('rows', 'count')) + pts_count = pts.groupby(by=rows_and_cols, as_index=False).agg(px_count=('rows', 'count')) # if dedupe_with_median: # import IPython; IPython.embed(); import sys; sys.exit() 
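The rework that follows replaces the simple per-pixel mean with a median-based dedupe: observations are grouped by the raster cell they fall in ('rows', 'cols'), the per-cell median of each major-oxide column is computed, and the single observation whose profile sits closest to that median is kept. A minimal, self-contained pandas sketch of the same idea (the toy data and column subset are made up for illustration; the real scratch script also handles 'NA' strings and pixel counts):

import pandas as pd

# toy data: three observations fall in the same raster cell (rows=0, cols=0)
df = pd.DataFrame({
    "rows":  [0, 0, 0, 1],
    "cols":  [0, 0, 0, 2],
    "SIO2":  [61.0, 72.0, 62.0, 55.0],
    "AL2O3": [14.0, 9.0, 15.0, 18.0],
})
majors = ["SIO2", "AL2O3"]

# per-cell median of every major, attached back to each observation
med = df.groupby(["rows", "cols"], as_index=False)[majors].median()
df = df.merge(med, on=["rows", "cols"], suffixes=("", "_m"))

# relative distance of each observation from its cell median, summed over majors
for m in majors:
    df[m + "_dist"] = (df[m] - df[m + "_m"]).abs() / df[m + "_m"].abs()
df["median_sum"] = df[[m + "_dist" for m in majors]].sum(axis=1)

# keep, for every cell, the single observation closest to the median profile
deduped = df.loc[df.groupby(["rows", "cols"])["median_sum"].idxmin()]
print(deduped[["rows", "cols"] + majors])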
+rows_and_cols = ['rows', 'cols'] -pts_majors = pts[majors + ['rows', 'cols'] + geom_cols + ['geometry']] +pts_majors = pts[majors + rows_and_cols + geom_cols + ['geometry']] for m in majors: - pts_majors[m] = pts_majors[m].apply(lambda x: np.nan if x == 'NA' else float(x)) -pts_median = pts_majors[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).median() -pts_deduped = pts_median.merge(pts_count, how='inner', on=['rows', 'cols']) + pts[m] = pts[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=rows_and_cols) rename_dict = {k: k + '_m' for k in majors} pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) -all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=['rows', 'cols']) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=rows_and_cols) for m in majors: + print(m) all_pts_with_count_and_median[m + '_distance_from_median'] = \ abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ abs(all_pts_with_count_and_median[m + '_m']) @@ -167,7 +169,7 @@ def sum_distance_from_median(x): x), axis=1) all_pts_with_count_and_median_min_median_sum = \ - all_pts_with_count_and_median.groupby(by=['rows', 'cols'], as_index=False).agg( + all_pts_with_count_and_median.groupby(by=rows_and_cols, as_index=False).agg( median_sum_min=('median_sum', 'min'), median_sum_std=('median_sum', 'std') ) @@ -178,7 +180,7 @@ def sum_distance_from_median(x): # all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] -# min_pts = all_pts.groupby(by=['rows', 'cols'], as_index=False).median_sum_to_median_min_diff.agg( +# min_pts = all_pts.groupby(by=rows_and_cols, as_index=False).median_sum_to_median_min_diff.agg( # ['min', 'std'] # ) @@ -189,15 +191,15 @@ def sum_distance_from_median(x): # all_pts.merge(min_pts, how="inner", on=['rows', 'cols', "median_sum_to_median_min_diff"]) # all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], -# how='left',on=['rows', 'cols']) +# how='left',on=rows_and_cols) else: # dedupe with mean - pts_mean = pts[majors + ['rows', 'cols']].groupby(by=['rows', 'cols'], as_index=False).mean() - pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + pts_mean = pts[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=rows_and_cols) # import IPython; IPython.embed(); import sys; sys.exit() pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y'])) - # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] + # pts_deduped = pts.drop_duplicates(subset=rows_and_cols)[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: pts_deduped = pts From 39e04f4319e4ae0e53804f0b55e8885beef72c5e Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sun, 10 Apr 2022 08:59:14 +1000 Subject: [PATCH 09/33] add interpolate shapes script used in nci for wa models --- scripts/interpolate_shapes.py | 63 +++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 scripts/interpolate_shapes.py diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py new file mode 100644 index 00000000..83b34e8f --- /dev/null +++ b/scripts/interpolate_shapes.py 
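The script added in this patch resamples each point's conductivity profile onto a common set of depths taken from a reference AEM shapefile. Its core is scipy.interpolate.interp1d with constant extrapolation at both ends; a minimal sketch of that pattern, with made-up depth and conductivity arrays, looks like this:

import numpy as np
from scipy import interpolate

# depths (cumulative layer thicknesses) at which one profile was measured
old_depths = np.array([5.0, 10.0, 20.0, 40.0])
conductivities = np.array([0.05, 0.08, 0.02, 0.01])

# common depths every profile should be expressed on
new_depths = np.array([2.0, 5.0, 15.0, 30.0, 60.0])

f = interpolate.interp1d(
    old_depths, conductivities,
    bounds_error=False,                                  # don't fail outside the measured range,
    fill_value=(conductivities[0], conductivities[-1]),  # just extend the first/last values
    assume_sorted=True,
)
print(f(new_depths))  # five interpolated conductivities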
@@ -0,0 +1,63 @@ +from functools import partial +from pathlib import Path +from scipy import interpolate +import numpy as np +import pandas as pd +import geopandas as gpd +from joblib import Parallel, delayed + + +def interpolate_1d(row, thicknesses): + conductivities = row[conductivity_cols].values + f = interpolate.interp1d(thicknesses, conductivities, bounds_error=False, + fill_value=(conductivities[0], conductivities[-1]), assume_sorted=True) + new_conductivities = f(new_thicknesses) + return new_conductivities + + +def interpolate_shape(shp): + print(f"interpolating {shp}") + df = gpd.read_file(shp) + thicknesses = df.loc[0, thickness_cols].values.cumsum() + interpolate_1d_local = partial(interpolate_1d, thicknesses=thicknesses) + interpolated_conductivities = df.apply( + func=interpolate_1d_local, + axis=1, + raw=False, + ) + df.loc[:, conductivity_cols] = np.vstack(interpolated_conductivities) + df.loc[:, thickness_cols] = np.tile(new_thicknesses, (df.shape[0], 1)) + target_shp = output_dir.joinpath(Path(shp).name) + df.to_file(target_shp) + print(f"wrote interpolated shapefile {target_shp}") + + +if __name__ == '__main__': + + ref_df = gpd.read_file("./NT_1_albers.shp", rows=1) + + shapes_to_convert = [ + './AEM_GAB_albers.shp', + './Albany_1_albers_cleaned.shp', + './Albany_2_albers_mod_cleaned.shp', + './Murchison_1_albers_cleaned.shp', + './Murchison_2_albers_cleaned.shp', + './Murchison_3_albers.shp', + './frome_1_albers.shp', + './frome_2_albers.shp', + './frome_3_albers.shp', + './frome_4_albers.shp', + './frome_5_albers.shp', + './frome_6_albers.shp' + ] + output_dir = Path('interpolated') + output_dir.mkdir(exist_ok=True, parents=True) + + conductivity_cols = [c for c in ref_df.columns if c.startswith('cond_')] + thickness_cols = [t for t in ref_df.columns if t.startswith('thick_')] + new_thicknesses = np.array(ref_df.loc[0, thickness_cols].values.cumsum(), dtype=np.float32) + + rets = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(interpolate_shape)(s) for s in shapes_to_convert) \ No newline at end of file From 11ed0d262491619e3265ccc0e94d7ebb008f4110 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 12 Apr 2022 15:15:31 +1000 Subject: [PATCH 10/33] paper plot --- scripts/plot_true_vs_prediction.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/plot_true_vs_prediction.py b/scripts/plot_true_vs_prediction.py index 819ad888..f7f4902a 100644 --- a/scripts/plot_true_vs_prediction.py +++ b/scripts/plot_true_vs_prediction.py @@ -99,3 +99,25 @@ def plot_quantiles(line): plt.show() + + +xx = line.d +y_test = line.Prediction +y_lower = line['Lower_quan'] +y_upper = line['Upper_quan'] +plt.plot(xx, line.y_true, "b.", markersize=10, label="Observed log10(conductivity)") +plt.plot(xx, line.out, "g", markersize=10, linestyle='-', + label="Kriged log10(conductivity)") +plt.plot(xx, y_test, color='orange', linestyle='-', + # marker='o', + label="Predicted median") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + # (95362.213934-line.d).ravel() + (line.d).ravel(), + y_lower, y_upper, alpha=0.4, label='Predicted 90% interval' +) +plt.legend(loc='upper left') +plt.ylabel('log10(conductivity)') +plt.xlabel('Distance along flight line (meters)') From 3c333e04ba3c7763ec784b547574c808eff73f8d Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 28 May 2022 11:10:31 +1000 Subject: [PATCH 11/33] update script --- scripts/intersect_rasters.py | 79 +++++++----------------------------- 1 file changed, 15 insertions(+), 64 
deletions(-) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index 0aef9989..a33529b7 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -33,64 +33,13 @@ def generate_key_val(gl, shorts): return gl, eight_char_name + str(shorts[eight_char_name]) -# import IPython; IPython.embed(); import sys; sys.exit() - - -# data_location = \ -# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") -# Read points from shapefile - -# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") - -# geotifs = { -# "relief_radius4.tif": "relief4", -# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", -# "MvrtpLL_smooth.tif": "mrvtpll_s", -# "MvrtpLL_fin.tif": "mvrtpll_f", -# "mrvbf_9.tif": "mrvbf_9", -# "Rad2016K_Th.tif": "rad2016kth", -# "Thorium_2016.tif": "th_2016", -# "Mesozoic_older_raster_MEAN.tif": "meso_mean", -# "LOC_distance_to_coast.tif": "loc_dis", -# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", -# "water-85m_1.tif": "water_85m", -# "clim_RSM_albers.tif": "clim_rsm", -# "tpi_300.tif": "tpi_300", -# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", -# "si_geol1.tif": "si_geol1", -# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", -# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", -# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", -# "Gravity_land.tif": "gravity", -# "dem_fill.tif": "dem", -# "Clim_Prescott_LindaGregory.tif": "clim_linda", -# "slope_fill2.tif": "slopefill2", -# "clim_PTA_albers.tif": "clim_alber", -# "SagaWET9cell_M.tif": "sagawet", -# "ceno_euc_aust1.tif": "ceno_euc", -# "s2-dpca-85m_band1.tif": "s2_band1", -# "s2-dpca-85m_band2.tif": "s2_band2", -# "s2-dpca-85m_band3.tif": "s2_band3", -# "3dem_mag0_finn.tif": "3dem_mag0", -# "3dem_mag1_fin.tif": "3dem_mag1", -# "3dem_mag2.tif": "3dem_mag2", -# } - -# geotifs = { -# "relief_apsect.tif": "relief4", -# "LATITUDE_GRID1.tif": "latitude", -# "LONGITUDE_GRID1.tif": "longitude", -# "er_depg.tif": "er_depg", -# "sagawet_b_sir.tif": "sagawet", -# "dem_foc2.tif": "dem_foc2", -# "outcrop_dis2.tif": "outcrop", -# "k_15v5.tif": "k_15v5", -# } - - -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, intersect_parallel: bool = False): +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, + intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g) coords = np.array([(p.x, p.y) for p in pts.geometry]) geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) pts = pts.merge(geom, left_index=True, right_index=True) @@ -124,7 +73,6 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() - # import IPython; IPython.embed(); import sys; sys.exit() pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], @@ -139,6 +87,7 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") # pts_deduped[v] = __extract_raster_points(coords_deduped, k) if intersect_parallel: + 
print("=====================Intersecting parallelly =======================") pts_deduped__ = Parallel( n_jobs=-1, verbose=100, @@ -149,14 +98,14 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = else: for i, (k, v) in enumerate(geotifs.items()): print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") - pts_deduped[v] = __extract_raster_points(coords_deduped, k) + pts_deduped[v] = __extract_raster_points(k, coords_deduped) # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) return pts_deduped -def __extract_raster_points(raster, coords_deduped): +def __extract_raster_points(raster: str, coords_deduped: np.ndarray): with rasterio.open(Path(raster)) as src: print(f"---- intersecting {raster}--------") return [x[0] for x in src.sample(coords_deduped)] @@ -169,8 +118,10 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = output_dir.joinpath('clean') out_shp = output_dir.joinpath(shp.name) df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + finite_idices = ((np.isfinite(df3)).sum(axis=1) == len(geotifs)) & \ + ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)) + df4 = df2.loc[finite_idices, :] + df5 = df2.loc[~finite_idices, :] try: if df5.shape[0]: df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) @@ -190,7 +141,7 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = # local output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - downscale_factor = 2 # keep 1 point in a 2x2 cell + downscale_factor = 1 # keep 1 point in a 2x2 cell geom_cols = ['POINT_X', 'POINT_Y'] @@ -215,8 +166,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = print(f"\"{Path(k).name}\": \"{v}\"") print('='*100) - for s in shapefile_location.glob("*.shp"): - intersect_sample_and_clean(s, True, True) + for s in shapefile_location.glob("geochem_sites.shp"): + intersect_sample_and_clean(s, dedupe=True, write_dropped=True, intersect_parallel=True) # rets = Parallel( # n_jobs=-1, # verbose=100, From c32ddaeb6912ab2e32cf300fc3c941ac8e6679af Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Mon, 4 Jul 2022 12:51:48 +1000 Subject: [PATCH 12/33] [predict] try once without cleaning for all algos --- uncoverml/predict.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index f5abb696..512ecdd4 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -128,8 +128,7 @@ def _fix_for_corrupt_data(x, feature_names): x.data[min_mask] = float32finfo.min x.data[max_mask] = float32finfo.max x.mask += min_mask + max_mask - problem_feature_names = \ - list(compress(feature_names, ~isfinite.all(axis=0))) + problem_feature_names = list(compress(feature_names, ~isfinite.all(axis=0))) print("Warning: Float64 data was truncated to float32 max/min values." 
"Check your covariates for possible nodatavalue, datatype, " @@ -160,15 +159,6 @@ def _get_data(subchunk, config): log.info("Applying feature transforms") x = features.transform_features(extracted_chunk_sets, transform_sets, config.final_transform, config)[0] - # only check/correct float32 conversion for Ensemble models - if config.pca: - x = _fix_for_corrupt_data(x, features_names) - return _mask_rows(x, subchunk, config), features_names - - if not config.clustering: - if (isinstance(modelmaps[config.algorithm](), BaseEnsemble) or - config.multirandomforest): - x = _fix_for_corrupt_data(x, features_names) return _mask_rows(x, subchunk, config), features_names @@ -213,8 +203,12 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf log.info("Loaded {:2.4f}GB of image data".format(total_gb)) alg = config.algorithm log.info("Predicting targets for {}.".format(alg)) - y_star = predict(x, model, interval=config.quantiles, - lon_lat=_get_lon_lat(subchunk, config)) + try: + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) + except ValueError as v: + log.warning(v) + x = _fix_for_corrupt_data(x, features_names) + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) # cluster_analysis(x, y_star, subchunk, config, feature_names) From 00d41baea16680afa751a3a1daa6174e79ea66fe Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 23 Jul 2022 05:55:11 +1000 Subject: [PATCH 13/33] update some scripts --- configs/demo_regression.yaml | 1 - configs/group_and_split_shapefile.yaml | 0 configs/reference_gb.yaml | 42 ++++++++--------------- configs/reference_xgboost.yaml | 28 ++++++++-------- configs/resampling.yaml | 4 +-- scripts/check_covariates.py | 46 ++++++++++++++++++++++++++ scripts/interpolate_shapes.py | 31 +++++++++-------- scripts/intersect_rasters.py | 3 +- 8 files changed, 96 insertions(+), 59 deletions(-) mode change 100755 => 100644 configs/group_and_split_shapefile.yaml mode change 100755 => 100644 configs/resampling.yaml create mode 100644 scripts/check_covariates.py diff --git a/configs/demo_regression.yaml b/configs/demo_regression.yaml index eda84ce0..c120e7aa 100644 --- a/configs/demo_regression.yaml +++ b/configs/demo_regression.yaml @@ -33,7 +33,6 @@ mask: # imputation options are gaus, nn, mean preprocessing: imputation: mean - transforms: # - whiten: # keep_fraction: 0.8 diff --git a/configs/group_and_split_shapefile.yaml b/configs/group_and_split_shapefile.yaml old mode 100755 new mode 100644 diff --git a/configs/reference_gb.yaml b/configs/reference_gb.yaml index 2c1717a8..c5f7f787 100644 --- a/configs/reference_gb.yaml +++ b/configs/reference_gb.yaml @@ -58,17 +58,17 @@ learning: algorithm: gradientboost arguments: target_transform: identity - loss: 'ls' - max_depth: 20 - learning_rate: 0.1 - n_estimators: 100 - subsample: 0.9 - min_samples_split: 2 - min_samples_leaf: 2 - min_weight_fraction_leaf: 0.0 - max_features: "auto" - alpha: 0.95 - random_state: 3 +# loss: 'ls' +# max_depth: 20 +# learning_rate: 0.1 + n_estimators: 200 +# subsample: 0.9 +# min_samples_split: 2 +# min_samples_leaf: 2 +# min_weight_fraction_leaf: 0.0 +# max_features: "auto" +# alpha: 0.95 +# random_state: 3 optimisation: hyperopt_params: max_evals: 5 @@ -89,22 +89,6 @@ learning: min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) max_leaf_nodes: 
randint('max_leaf_nodes', 10, 50) - searchcv_params: - n_iter: 6 - cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform") - 'n_estimators': Integer(10, 100) - 'subsample': Real(0.01, 1.0, prior='uniform') - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') - 'max_leaf_nodes': Integer(10, 50) prediction: # corner_coordinates: @@ -114,10 +98,10 @@ prediction: outbands: 1 validation: - #- feature_rank + - feature_rank - parallel - k-fold: - folds: 3 + folds: 5 random_seed: 1 output: diff --git a/configs/reference_xgboost.yaml b/configs/reference_xgboost.yaml index dcd762c7..7101cd73 100644 --- a/configs/reference_xgboost.yaml +++ b/configs/reference_xgboost.yaml @@ -1,26 +1,29 @@ - experiment: my_run patchsize: 0 memory_fraction: 0.5 features: - - name: my continuous features + - name: my continuous features 2 type: continuous files: - - path: configs/data/sirsam/er_depg.tif - - path: configs/data/sirsam/sagawet_b_sir.tif - path: configs/data/sirsam/dem_foc2.tif - path: configs/data/sirsam/outcrop_dis2.tif -# - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif -# - path: configs/data/sirsam/LATITUDE_GRID1.tif -# - path: configs/data/sirsam/LONGITUDE_GRID1.tif -# - directory: configs/data/weights/ transforms: - identity # - whiten: # keep_fraction: 0.98 - imputation: none + imputation: mean + - name: my continuous features 1 + type: continuous + files: + - path: configs/data/sirsam/er_depg.tif + - path: configs/data/sirsam/sagawet_b_sir.tif + transforms: + - identity + # - whiten: + # keep_fraction: 0.98 + imputation: gauss preprocessing: imputation: none @@ -125,7 +128,7 @@ prediction: outbands: 1 validation: - #- feature_rank +# - feature_rank - parallel # - permutation_importance # - feature_importance @@ -134,9 +137,8 @@ validation: random_seed: 1 oos_validation: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/geochem_sites.shp + property: K_ppm_imp output: directory: ref_xgb/ - diff --git a/configs/resampling.yaml b/configs/resampling.yaml old mode 100755 new mode 100644 index 791831b0..c84cfc73 --- a/configs/resampling.yaml +++ b/configs/resampling.yaml @@ -2,8 +2,8 @@ # when undersample == true, we choose points with or without bootstrap targets: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/sirsam/out_resampled/geochem_sites.shp + property: K_ppm_imp resample: spatial: undersample: false diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py new file mode 100644 index 00000000..5aa67766 --- /dev/null +++ b/scripts/check_covariates.py @@ -0,0 +1,46 @@ +from joblib import delayed, Parallel +from pathlib import Path +import rasterio as rio +import numpy as np +import pandas as pd + +dir = "configs/data/sirsam" + +mask = Path(dir).joinpath("dem_foc2.tif") + +with rio.open(mask) as geotif: + mask_raster = geotif.read(1, masked=True) + + +def _parallel_read(r): + try: + with rio.open(r) as geotif: + raster = geotif.read(1, masked=True) + m = geotif.meta + m['crs'] = m['crs'].to_string() + t = m.pop('transform') + m['pixsize_x'] = t[0] + m['top_left_x'] = t[2] + m['pixsize_y'] = -t[4] + m['top_left_y'] = t[5] + raster.mask = raster.mask | mask_raster.mask + m['all_finite'] = np.all(np.isfinite(raster)) + m['any_nan'] = 
np.any(np.isnan(raster)) + m['any_large'] = np.any(np.abs(raster) > 1e10) + # raster_attrs[r.stem] = m + return m + except Exception as e: + print(r) + print(e) + return [None] * 14 + + +rets = Parallel( + n_jobs=-1, + verbose=100, +)(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} + +df = pd.DataFrame.from_dict(raster_attrs) +df.to_csv("quality.csv") diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py index 83b34e8f..5140cb59 100644 --- a/scripts/interpolate_shapes.py +++ b/scripts/interpolate_shapes.py @@ -34,21 +34,26 @@ def interpolate_shape(shp): if __name__ == '__main__': - ref_df = gpd.read_file("./NT_1_albers.shp", rows=1) + ref_df = gpd.read_file("/g/data/ge3/aem_sections/AEM_covariates/NT_1_albers.shp", rows=1) shapes_to_convert = [ - './AEM_GAB_albers.shp', - './Albany_1_albers_cleaned.shp', - './Albany_2_albers_mod_cleaned.shp', - './Murchison_1_albers_cleaned.shp', - './Murchison_2_albers_cleaned.shp', - './Murchison_3_albers.shp', - './frome_1_albers.shp', - './frome_2_albers.shp', - './frome_3_albers.shp', - './frome_4_albers.shp', - './frome_5_albers.shp', - './frome_6_albers.shp' + # "/g/data/ge3/data/AEM_shapes/AEM_ERC_ALL_interp_data_Albers_Ceno.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_3_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_4_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_5_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_6_albers.shp", + "/g/data/ge3/data/AEM_shapes/Mundi_Mundi_albers.shp", ] output_dir = Path('interpolated') output_dir.mkdir(exist_ok=True, parents=True) diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index a33529b7..ea50d789 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -133,7 +133,8 @@ def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = print(f"saved intersected shapefile at {out_shp.as_posix()}") print(f"No points dropped and there for _cleaned.shp file is not createed'") print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - except: + except Exception as e: + print(e) print(f"Check this shapefile {shp}") From 4129050e7fb7bb105778415160b90ba2cd487b0b Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 27 Aug 2022 06:23:58 +1000 Subject: [PATCH 14/33] dedupe shape script added --- scripts/check_covariates.py | 4 +-- scripts/dedupe_shape.py | 70 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 scripts/dedupe_shape.py diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 5aa67766..24970320 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -23,7 +23,7 @@ def _parallel_read(r): m['top_left_x'] = t[2] m['pixsize_y'] = -t[4] m['top_left_y'] = t[5] - raster.mask = raster.mask | mask_raster.mask + raster.mask 
= raster.mask | mask_raster.mask # we are not interested in masked areas m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) @@ -40,7 +40,7 @@ def _parallel_read(r): verbose=100, )(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) -raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} df = pd.DataFrame.from_dict(raster_attrs) df.to_csv("quality.csv") diff --git a/scripts/dedupe_shape.py b/scripts/dedupe_shape.py new file mode 100644 index 00000000..42c2b440 --- /dev/null +++ b/scripts/dedupe_shape.py @@ -0,0 +1,70 @@ +from pathlib import Path +from joblib import Parallel, delayed +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd + + +def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path): + """ + :param shp: input shapefile with dense points + :param tif: sample tif to read resolution details + :param deduped_shp: output shapefile with one point per down-sampled raster resolution + :return: + """ + print("====================================\n", f"deduping {shp.as_posix()}") + geom_cols = ['POINT_X', 'POINT_Y'] + pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g, axis=1) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) + pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, + geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']), + crs="EPSG:3577" # Australian Albers + ) + pts_deduped.to_file(deduped_shp.as_posix()) + return pts_deduped + + +if __name__ == '__main__': + shapefiles = Path("configs/data/") + downscale_factor = 6 # keep 1 point in a 6x6 cell + + dem = Path('/home/my_dem.tif') + output_dir = Path('1in6') + output_dir.mkdir(exist_ok=True, parents=True) + + # for s in shapefiles.glob("*.shp"): + # deduped_shp = output_dir.joinpath(s.name) + # dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp) + + Parallel( + n_jobs=-1, + verbose=100, + )(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp")) From 508576d13178c447dec96b27f4007774895bdd20 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Sat, 3 Sep 2022 17:53:02 +1000 Subject: [PATCH 15/33] add more statistics in covariate check --- scripts/check_covariates.py | 14 ++++++++++++-- setup.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 24970320..3105f0ab 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -1,6 +1,8 @@ +import subprocess from 
joblib import delayed, Parallel from pathlib import Path import rasterio as rio +from rasterio.io import DatasetReader import numpy as np import pandas as pd @@ -12,10 +14,11 @@ mask_raster = geotif.read(1, masked=True) -def _parallel_read(r): + +def _parallel_read(r: Path): try: with rio.open(r) as geotif: - raster = geotif.read(1, masked=True) + raster: DatasetReader = geotif.read(1, masked=True) m = geotif.meta m['crs'] = m['crs'].to_string() t = m.pop('transform') @@ -27,6 +30,13 @@ def _parallel_read(r): m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) + m['min'] = np.ma.min(raster) + m['max'] = np.ma.max(raster) + m['std'] = np.ma.std(raster) + m['valid_percent'] = 100 - np.sum(raster.mask) / raster.size * 100 + print("=" * 50) + print(r, m['valid_percent']) + subprocess.run(f"gdalinfo {r.as_posix()} -stats | grep -i valid_", shell=True) # raster_attrs[r.stem] = m return m except Exception as e: diff --git a/setup.py b/setup.py index 068891c6..b14d4aba 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def run(self): 'revrand >= 0.9.10', 'mpi4py==3.1.3', 'scipy==1.6.2', - 'scikit-learn~=0.22.2', + 'scikit-learn==0.22.2.post1', 'scikit-image==0.19.1', 'scikit-optimize == 0.8.1', 'wheel >= 0.29.0', From 78bbc68a27346bb93d7d737f34c118beec890cb1 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 9 Nov 2022 07:42:17 +1100 Subject: [PATCH 16/33] ref rf with hpopt --- configs/ref_rf.yaml | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index abdb678f..2f3d8c89 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -39,19 +39,35 @@ learning: random_state: 1 max_depth: 20 optimisation: - searchcv_params: - n_iter: 6 + hyperopt_params: + max_evals: 5 + step: 2 cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'n_estimators': Integer(10, 100) - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') + verbose: true + random_state: 3 + scoring: r2 # r2, neg_mean_absolute_error, etc..see note above + algo: bayes # bayes, or anneal + hp_params_space: + max_depth: randint('max_depth', 1, 15) + n_estimators: randint('n_estimators', 5, 25) + max_features: choice('max_features', ['auto', 'sqrt', 'log2']) + min_samples_split: randint('min_samples_split', 2, 50) + min_samples_leaf: randint('min_samples_leaf', 1, 50) + min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) + max_leaf_nodes: randint('max_leaf_nodes', 10, 50) +# searchcv_params: +# n_iter: 6 +# cv: 2 +# verbose: 1000 +# n_points: 3 +# n_jobs: 6 +# params_space: +# 'max_depth': Integer(1, 15) +# 'n_estimators': Integer(10, 100) +# 'max_features': Categorical(['auto', 'sqrt', 'log2']) +# 'min_samples_split': Integer(2, 50) +# 'min_samples_leaf': Integer(1, 50) +# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') prediction: From 2284f8396fbf5340158ce50caf8bd90c07d0d074 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 9 Nov 2022 07:43:08 +1100 Subject: [PATCH 17/33] discard more data during optmisation if they are not finite --- uncoverml/validate.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/uncoverml/validate.py b/uncoverml/validate.py index cc2cf341..5239cff9 100644 --- a/uncoverml/validate.py 
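The hp_params_space block added to ref_rf.yaml above describes a hyperopt search space, with algo: bayes presumably mapping to the TPE suggester and anneal to hyperopt's annealing suggester. Exactly how uncoverml evaluates those strings is not shown in this diff; a rough, hand-written equivalent of the same kind of search, using stand-in data and a plain scikit-learn objective, might look like:

from hyperopt import fmin, hp, tpe
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

# roughly the same space as the hp_params_space block, written by hand
space = {
    "max_depth": hp.quniform("max_depth", 1, 15, 1),
    "n_estimators": hp.quniform("n_estimators", 5, 25, 1),
    "max_features": hp.choice("max_features", ["sqrt", "log2"]),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 1, 50, 1),
}

def objective(params):
    model = RandomForestRegressor(
        max_depth=int(params["max_depth"]),
        n_estimators=int(params["n_estimators"]),
        max_features=params["max_features"],
        min_samples_leaf=int(params["min_samples_leaf"]),
        random_state=0,
    )
    # hyperopt minimises, so return the negated cross-validated r2 score
    return -cross_val_score(model, X, y, cv=2, scoring="r2").mean()

best = fmin(objective, space, algo=tpe.suggest, max_evals=5)
print(best)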
+++ b/uncoverml/validate.py @@ -372,14 +372,16 @@ def setup_validation_data(X, targets_all, cv_folds, random_state=None): random_state=random_state) rows_with_at_least_one_masked = ~ np.any(X.mask, axis=1) - X = X[rows_with_at_least_one_masked, :] - w = w[rows_with_at_least_one_masked] - y = y[rows_with_at_least_one_masked] - lon_lat = lon_lat[rows_with_at_least_one_masked, :] - groups = groups[rows_with_at_least_one_masked] + finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1] + valid_rows = rows_with_at_least_one_masked & finite_X + X = X[valid_rows, :] + w = w[valid_rows] + y = y[valid_rows] + lon_lat = lon_lat[valid_rows, :] + groups = groups[valid_rows] for (f, v), a in zip(targets_all.fields.items(), arrays): - targets_all.fields[f] = a[rows_with_at_least_one_masked] + targets_all.fields[f] = a[valid_rows] if len(np.unique(groups)) >= cv_folds: log.info(f'Using GroupKFold with {cv_folds} folds') From e9ed58318c8b0fbe49fe0203294842b292d6e37b Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 2 Dec 2022 17:24:16 +1100 Subject: [PATCH 18/33] update util scripts --- scripts/check_covariates.py | 19 +++++--- scripts/intersect_raster_with_shape.py | 62 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 scripts/intersect_raster_with_shape.py diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py index 3105f0ab..117f556c 100644 --- a/scripts/check_covariates.py +++ b/scripts/check_covariates.py @@ -27,16 +27,17 @@ def _parallel_read(r: Path): m['pixsize_y'] = -t[4] m['top_left_y'] = t[5] raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas + # print(raster) m['all_finite'] = np.all(np.isfinite(raster)) m['any_nan'] = np.any(np.isnan(raster)) m['any_large'] = np.any(np.abs(raster) > 1e10) m['min'] = np.ma.min(raster) + m['mean'] = np.ma.mean(raster) + m['median'] = np.ma.median(raster) m['max'] = np.ma.max(raster) m['std'] = np.ma.std(raster) - m['valid_percent'] = 100 - np.sum(raster.mask) / raster.size * 100 - print("=" * 50) - print(r, m['valid_percent']) - subprocess.run(f"gdalinfo {r.as_posix()} -stats | grep -i valid_", shell=True) + m['skew'] = 3 * (m['mean'] - m['median']) / m['std'] + # subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True) # raster_attrs[r.stem] = m return m except Exception as e: @@ -46,11 +47,15 @@ def _parallel_read(r: Path): rets = Parallel( - n_jobs=-1, + n_jobs=1, verbose=100, -)(delayed(_parallel_read)(r) for r in Path(dir).glob("*.tif")) +)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif")) -raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("*.tif"), rets)} +import pickle +with open("rets.pk", 'wb') as f: + pickle.dump(rets, f) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)} df = pd.DataFrame.from_dict(raster_attrs) df.to_csv("quality.csv") diff --git a/scripts/intersect_raster_with_shape.py b/scripts/intersect_raster_with_shape.py new file mode 100644 index 00000000..b25a21f9 --- /dev/null +++ b/scripts/intersect_raster_with_shape.py @@ -0,0 +1,62 @@ +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318_std.tif +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif + + +import geopandas as gpd +import numpy as np +import pandas as pd +import rasterio as rio + + +# with rasterio.open(tif) as src: +# # resample data to target shape +# data = src.read( +# 
out_shape=( +# src.count, +# int(src.height / downscale_factor), +# int(src.width / downscale_factor) +# ), +# resampling=rasterio.enums.Resampling.bilinear +# ) +# # scale image transform +# transform = src.transform * src.transform.scale( +# (src.width / data.shape[-1]), +# (src.height / data.shape[-2]) +# ) +# pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + +def raster_points() -> gpd.GeoDataFrame: + with rio.Env(): + with rio.open( + '/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif') as src: + crs = src.crs + # create 1D coordinate arrays (coordinates of the pixel center) + xmin, ymax = np.around(src.xy(0.00, 0.00), 9) # src.xy(0, 0) + xmax, ymin = np.around(src.xy(src.height - 1, src.width - 1), 9) # src.xy(src.width-1, src.height-1) + x = np.linspace(xmin, xmax, src.width) + y = np.linspace(ymax, ymin, src.height) # max -> min so coords are top -> bottom + + # create 2D arrays + xs, ys = np.meshgrid(x, y) + zs = src.read(1) + + # Apply NoData mask + mask = src.read_masks(1) > 0 + xs, ys, zs = xs[mask], ys[mask], zs[mask] + data = {"X": pd.Series(xs.ravel()), + "Y": pd.Series(ys.ravel()), + "Z": pd.Series(zs.ravel())} + df = pd.DataFrame(data=data) + geometry = gpd.points_from_xy(df.X, df.Y) + gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry) + return gdf + + +raster_gdf = raster_points() + +shape_file = "" +shp_gdf = gpd.read_file(shape_file) +gpd.overlay(raster_gdf, shp_gdf, how="intersection") +print(raster_gdf.head()) + From c4e2a684865239a74d418b3ab76e3b621b111a57 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Fri, 2 Dec 2022 17:24:58 +1100 Subject: [PATCH 19/33] minor changes in config and retained data during training --- uncoverml/config.py | 2 +- uncoverml/features.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/uncoverml/config.py b/uncoverml/config.py index ed1dee90..bff5c28c 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -286,6 +286,7 @@ def __init__(self, yaml_file): self.split_oos_fraction = s['targets']['split']['oos_fraction'] self.train_shapefile = Path(self.output_dir).joinpath(s['output']['train_shapefile']) self.oos_shapefile = Path(self.output_dir).joinpath(s['output']['oos_shapefile']) + self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.mask = None if 'mask' in s: @@ -380,7 +381,6 @@ def __init__(self, yaml_file): else self.name + ('.cluster' if self.clustering else '.model') self.model_file = Path(self.output_dir).joinpath(output_model) - self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") diff --git a/uncoverml/features.py b/uncoverml/features.py index bcbe4880..3d7fcebc 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -170,9 +170,15 @@ def cull_all_null_rows(feature_sets): bool_transformed_vectors = np.concatenate([t.mask for t in transformed_vectors], axis=1) - covaraiates = bool_transformed_vectors.shape[1] - rows_to_keep = np.sum(bool_transformed_vectors, axis=1) != covaraiates - return rows_to_keep + data_transformed_vectors = np.concatenate([t.data for t in + 
transformed_vectors], axis=1) + num_covariates = bool_transformed_vectors.shape[1] + + rows_with_at_least_one_cov_unmasked = np.sum(bool_transformed_vectors, axis=1) != num_covariates + # good rows are any covariate unmasked and all finite covariates + rows_with_all_finite_covariates = np.isfinite(data_transformed_vectors).sum(axis=1) == num_covariates + good_rows = rows_with_all_finite_covariates & rows_with_at_least_one_cov_unmasked + return good_rows def gather_features(x, node=None): From 69951f7f7455133e2a296467e8f43d6c394a11cc Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 00:17:24 +1100 Subject: [PATCH 20/33] pcas implemented --- configs/reference_pca.yaml | 3 ++- uncoverml/config.py | 2 ++ uncoverml/predict.py | 9 ++++++--- uncoverml/scripts/uncoverml.py | 27 ++++++++++++++++++++++----- uncoverml/transforms/linear.py | 2 +- 5 files changed, 33 insertions(+), 10 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index aea2c3c9..8abeb833 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -18,7 +18,8 @@ features: preprocessing: transforms: - whiten: - n_components: 4 + n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components allowed pca: geotif: diff --git a/uncoverml/config.py b/uncoverml/config.py index bff5c28c..c226f810 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -298,6 +298,8 @@ def __init__(self, yaml_file): preprocessing_transforms = s['preprocessing']['transforms'] if 'n_components' in preprocessing_transforms[0]['whiten']: self.n_components = preprocessing_transforms[0]['whiten']['n_components'] + elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: + self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: diff --git a/uncoverml/predict.py b/uncoverml/predict.py index 512ecdd4..f8ba2e71 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -13,6 +13,7 @@ from uncoverml.krige import krig_dict from uncoverml import transforms from uncoverml.config import Config +from uncoverml.hyopt import NpEncoder log = logging.getLogger(__name__) float32finfo = np.finfo(dtype=np.float32) @@ -217,17 +218,19 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): x, _ = _get_data(subchunk, config) - mpiops.run_once(export_pca_fractions, config) total_gb = mpiops.comm.allreduce(x.nbytes / 1e9) log.info("Loaded {:2.4f}GB of image data".format(total_gb)) log.info("Extracting PCAs....") + # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue + # is the first PC. 
This will hopefully remove some ambiguity image_out.write(np.flip(x, axis=1), subchunk) -def export_pca_fractions(config): +def export_pca_fractions(config: Config): whiten_transform = config.final_transform.global_transforms[0] with open(config.pca_json, 'w') as output_file: - json.dump(whiten_transform.explained_ratio, output_file, sort_keys=True, indent=4) + json.dump(whiten_transform.__dict__, output_file, sort_keys=True, indent=4, cls=NpEncoder) + log.info(f"wrote pc contributions/whiten transform stats in {config.pca_json}") def cluster_analysis(x, y, partition_no, config, feature_names): diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index f40c1352..27939d39 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -390,14 +390,20 @@ def predict(model_or_cluster_file, partitions, mask, retain): @click.argument('pipeline_file') @click.option('-p', '--partitions', type=int, default=1, help='divide each node\'s data into this many partitions') +@click.option('-s', '--subsample_fraction', type=float, default=1.0, + help='only use this fraction of the data for learning classes') @click.option('-m', '--mask', type=str, default='', help='mask file used to limit prediction area') @click.option('-r', '--retain', type=int, default=None, help='mask values where to predict') -def pca(pipeline_file, partitions, mask, retain): +def pca(pipeline_file, partitions, subsample_fraction, mask, retain): config = ls.config.Config(pipeline_file) + + __validate_pca_config(config) # no other transforms other than whiten assert config.pca, "Not a pca analysis. Please include the pca block in your yaml!!" config.mask = mask if mask else config.mask + + config.subsample_fraction = subsample_fraction if config.mask: config.retain = retain if retain else config.retain @@ -413,8 +419,20 @@ def pca(pipeline_file, partitions, mask, retain): else: log.info("Using memory aggressively: dividing all data between nodes") - image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(config.n_components, config) - tr_whiten = __validate_pca_config(config) + # Get the image chunks and their associated transforms + image_chunk_sets = ls.geoio.unsupervised_feature_sets(config) + transform_sets = [k.transform_set for k in config.feature_sets] + + _ = ls.features.transform_features(image_chunk_sets, + transform_sets, + config.final_transform, + config) + log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") + ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) + ls.mpiops.run_once(ls.predict.export_pca_fractions, config) + + whiten_transform = config.final_transform.global_transforms[0] + image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(whiten_transform.keepdims, config) outfile_tif = config.name + "_pca" image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, @@ -422,10 +440,8 @@ def pca(pipeline_file, partitions, mask, retain): config.n_subchunks, config.output_dir, band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)], **config.geotif_options) - for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) - # TODO: ideally want to take a random sample of each covariate and compute whiten stats ls.predict.export_pca(i, image_out, config) # explicitly close output rasters @@ -435,6 +451,7 @@ def pca(pipeline_file, partitions, mask, retain): def __validate_pca_config(config): + # assert no other transforms other than whiten 
transform_sets = [k.transform_set for k in config.feature_sets] for t in transform_sets: assert len(t.image_transforms) == 0 # validation that there are no image or global transforms diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index 2fc0e7b8..343bd663 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -90,7 +90,7 @@ def __init__(self, keep_fraction=None, n_components=None): if keep_fraction is None: assert n_components is not None assert (keep_fraction is None) or (n_components is None) - self.explained_ratio = None + self.explained_ratio = {} def __call__(self, x): x = x.astype(float) From 0dfeff7e1299fdc925629ed2205a0b60b940ba86 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 00:48:26 +1100 Subject: [PATCH 21/33] some helpful logs --- uncoverml/geoio.py | 5 +++-- uncoverml/scripts/uncoverml.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 0850c077..03c65d95 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -537,8 +537,9 @@ def f(image_source): patchsize=config.patchsize) if frac < 1.0: np.random.seed(1) - r = r[np.random.rand(r.shape[0]) < frac] - return r + rr = r[np.random.rand(r.shape[0]) < frac] + log.info(f"sampled {rr.shape[0]} from max possible {r.shape[0]}") + return rr result = _iterate_sources(f, config) return result diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 27939d39..a748c501 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -423,10 +423,12 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_chunk_sets = ls.geoio.unsupervised_feature_sets(config) transform_sets = [k.transform_set for k in config.feature_sets] - _ = ls.features.transform_features(image_chunk_sets, + features, _ = ls.features.transform_features(image_chunk_sets, transform_sets, config.final_transform, config) + log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" + f" {features.shape[0]} points from the rasters") log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) ls.mpiops.run_once(ls.predict.export_pca_fractions, config) From bbb603bc6f2272c08c0fa85242af196abc575f33 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 01:02:27 +1100 Subject: [PATCH 22/33] some more helpful logs --- uncoverml/scripts/uncoverml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index a748c501..c7d632a8 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -427,8 +427,10 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): transform_sets, config.final_transform, config) + print(f"process {ls.mpiops.chunk_index} has {features.shape[0]} samples") + num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0])) log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" - f" {features.shape[0]} points from the rasters") + f" {num_samples} points from the rasters") log.info(f"Done whiten tranform with {subsample_fraction*100}% of all data") ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config) ls.mpiops.run_once(ls.predict.export_pca_fractions, config) From 4a9c899c850f676335eaaffe0eda19f841dbdfd0 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 
2022 07:27:29 +1100 Subject: [PATCH 23/33] some more useful logs --- uncoverml/transforms/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index 343bd663..ebf3e4e3 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -1,7 +1,8 @@ - +import logging import numpy as np from uncoverml import mpiops +log = logging.getLogger(__name__) class CentreTransform: @@ -96,6 +97,9 @@ def __call__(self, x): x = x.astype(float) if self.mean is None or self.eigvals is None or self.eigvecs is None: self.mean = mpiops.mean(x) + num_points = np.sum(mpiops.comm.gather(x.shape[0])) + num_covariates = x.shape[1] + log.info(f"Matrix for eigenvalue decomposition has shape {num_points, num_covariates}") self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x) ndims = x.shape[1] # make sure 1 <= keepdims <= ndims From 4bd8490f67c4535b49f0606d36d8bbac365d9df3 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 6 Dec 2022 08:10:12 +1100 Subject: [PATCH 24/33] variation fraction support implemented for pca analysis --- configs/reference_pca.yaml | 5 +++-- uncoverml/config.py | 2 ++ uncoverml/scripts/uncoverml.py | 2 +- uncoverml/transforms/linear.py | 26 +++++++++++++++----------- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index 8abeb833..284b26fd 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -18,8 +18,9 @@ features: preprocessing: transforms: - whiten: - n_components: 2 -# keep_fraction: 0.2 # one of keep fraction or n_components allowed +# n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed + variation_fraction: 0.98 pca: geotif: diff --git a/uncoverml/config.py b/uncoverml/config.py index c226f810..3cf47323 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -300,6 +300,8 @@ def __init__(self, yaml_file): self.n_components = preprocessing_transforms[0]['whiten']['n_components'] elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] + elif 'variation_fraction' in preprocessing_transforms[0]['whiten']: + self.variation_fraction = preprocessing_transforms[0]['whiten']['variation_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index c7d632a8..71e4e6ee 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -442,7 +442,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, outfile_tif, config.n_subchunks, config.output_dir, - band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)], + band_tags=[f'_pc_{n}' for n in range(1, whiten_transform.keepdims+1)], **config.geotif_options) for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py index ebf3e4e3..0017da2f 100644 --- a/uncoverml/transforms/linear.py +++ b/uncoverml/transforms/linear.py @@ -80,17 +80,16 @@ def __call__(self, *args): class WhitenTransform: - def __init__(self, keep_fraction=None, n_components=None): + def __init__(self, keep_fraction=None, n_components=None, variation_fraction=None): self.mean = None self.eigvals = None self.eigvecs = None 
self.keep_fraction = keep_fraction self.n_components = n_components - if n_components is None: - assert keep_fraction is not None - if keep_fraction is None: - assert n_components is not None - assert (keep_fraction is None) or (n_components is None) + self.variation_fraction = variation_fraction + null_check = [x is None for x in [keep_fraction, n_components, variation_fraction]] + assert np.sum(list(null_check)) == 2, "only one of keep_fraction, n_components, " \ + "variation_fraction can be non null" self.explained_ratio = {} def __call__(self, x): @@ -103,16 +102,21 @@ def __call__(self, x): self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x) ndims = x.shape[1] # make sure 1 <= keepdims <= ndims - if self.n_components is None: + self.explained_ratio = { + ndims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-ndims:])/np.sum(np.abs( + self.eigvals))) + } + + if self.n_components is None and self.variation_fraction is None: self.keepdims = min(max(1, int(ndims * self.keep_fraction)), ndims) + elif self.n_components is None and self.keep_fraction is None: + explained_ratio_cum_sum = np.cumsum(list(self.explained_ratio.values())[::-1]) + self.keepdims = np.searchsorted(explained_ratio_cum_sum > self.variation_fraction*100, True) + 1 + log.info(f"Variation percentage {self.variation_fraction*100} requires only {self.keepdims} PCs") else: self.keepdims = self.n_components assert self.n_components < ndims, "More components demanded than features. Not possible! \n " \ "Please reduce n_components or increase the number of features" - self.explained_ratio = { - self.keepdims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-self.keepdims:])/np.sum(np.abs( - self.eigvals))) - } mat = self.eigvecs[:, - self.keepdims:] vec = self.eigvals[np.newaxis, - self.keepdims:] From 00d0f7d82bbf5da1c8abe186a884bfed577c0517 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Mon, 12 Dec 2022 18:09:44 +1100 Subject: [PATCH 25/33] pca writeout was missing apply masked functionality --- configs/reference_pca.yaml | 8 +++++++- uncoverml/features.py | 4 +++- uncoverml/mpiops.py | 7 ++++++- uncoverml/predict.py | 3 ++- uncoverml/scripts/uncoverml.py | 7 +++---- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index 284b26fd..46b76757 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -4,7 +4,7 @@ memory_fraction: 0.5 features: - name: my continuous features - type: continuous + type: ordinal files: - path: configs/data/sirsam/er_depg.tif - path: configs/data/sirsam/sagawet_b_sir.tif @@ -14,6 +14,8 @@ features: - path: configs/data/sirsam/relief_apsect.tif - path: configs/data/sirsam/LATITUDE_GRID1.tif - path: configs/data/sirsam/LONGITUDE_GRID1.tif + imputation: nn +# nodes: 5000 preprocessing: transforms: @@ -22,6 +24,10 @@ preprocessing: # keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed variation_fraction: 0.98 +mask: + file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif + retain: 1 + pca: geotif: TILED: YES diff --git a/uncoverml/features.py b/uncoverml/features.py index 3d7fcebc..ffa16da0 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -87,7 +87,9 @@ def transform_features(feature_sets, transform_sets, final_transform, config): if mpiops.chunk_index == 0 and config.pickle: log.info('Saving featurevec for reuse') pickle.dump(feature_vec, open(config.featurevec, 'wb')) - + if mpiops.chunk_index == 0: + for i, f in 
enumerate(feature_sets[0].keys()): + log.debug(f"Using feature num {i} from: {f}") x = np.ma.concatenate(transformed_vectors, axis=1) if config.cubist or config.multicubist or config.krige: log.warning("{}: Ignoring preprocessing " diff --git a/uncoverml/mpiops.py b/uncoverml/mpiops.py index ad17814f..5c8e37a4 100644 --- a/uncoverml/mpiops.py +++ b/uncoverml/mpiops.py @@ -146,7 +146,11 @@ def outer(x): out = comm.allreduce(x_outer_local) still_masked = np.ma.count_masked(out) if still_masked != 0: - log.info('Reported out: ' + ', '.join([str(s) for s in out])) + log.info("=========still_masked =================") + if chunk_index == 0: + for s in out: + log.info(s) + # log.info('Reported out: ' + ', '.join([str(s) for s in out])) raise ValueError("Can't compute outer product:" " completely missing columns!") if hasattr(out, 'mask'): @@ -161,6 +165,7 @@ def covariance(x): def eigen_decomposition(x): + log.info(f"Eigenvalue matrix shape in process {chunk_index}: {x.shape}") eigvals, eigvecs = np.linalg.eigh(covariance(x)) return eigvals, eigvecs diff --git a/uncoverml/predict.py b/uncoverml/predict.py index f8ba2e71..b3ead3a7 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -223,7 +223,8 @@ def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): log.info("Extracting PCAs....") # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue # is the first PC. This will hopefully remove some ambiguity - image_out.write(np.flip(x, axis=1), subchunk) + x = apply_masked(lambda _x: _x, np.flip(x, axis=1)) + image_out.write(x, subchunk) def export_pca_fractions(config: Config): diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 71e4e6ee..9f7af6ec 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -414,8 +414,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): config.n_subchunks = partitions if config.n_subchunks > 1: - log.info("Memory contstraint forcing {} iterations " - "through data".format(config.n_subchunks)) + log.info("Memory contstraint forcing {} iterations through data".format(config.n_subchunks)) else: log.info("Using memory aggressively: dividing all data between nodes") @@ -427,7 +426,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): transform_sets, config.final_transform, config) - print(f"process {ls.mpiops.chunk_index} has {features.shape[0]} samples") + features, _ = ls.features.remove_missing(features) num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0])) log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of" f" {num_samples} points from the rasters") @@ -442,7 +441,7 @@ def pca(pipeline_file, partitions, subsample_fraction, mask, retain): image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs, outfile_tif, config.n_subchunks, config.output_dir, - band_tags=[f'_pc_{n}' for n in range(1, whiten_transform.keepdims+1)], + band_tags=[f'pc_{n}' for n in range(1, whiten_transform.keepdims+1)], **config.geotif_options) for i in range(config.n_subchunks): log.info("starting to render partition {}".format(i+1)) From 0686d0c6f266b49d1a85dbe2241b8dcacf001000 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Tue, 13 Dec 2022 06:51:37 +1100 Subject: [PATCH 26/33] remove all masked area treatment --- uncoverml/predict.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py 
index b3ead3a7..d8ae5231 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -144,16 +144,16 @@ def _get_data(subchunk, config): # NOTE: This returns an *untransformed* x, # which is ok as we just need dummies here - if config.mask: - mask_x = _mask(subchunk, config) - all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) - if all_mask_x.shape[0] == np.sum(all_mask_x.mask): - x = np.ma.zeros((mask_x.shape[0], len(features_names)), - dtype=np.bool) - x.mask = True - log.info('Partition {} covariates are not loaded as ' - 'the partition is entirely masked.'.format(subchunk + 1)) - return x, features_names + # if config.mask: + # mask_x = _mask(subchunk, config) + # all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) + # if all_mask_x.shape[0] == np.sum(all_mask_x.mask): + # x = np.ma.zeros((mask_x.shape[0], len(features_names)), + # dtype=np.bool) + # x.mask = True + # log.info('Partition {} covariates are not loaded as ' + # 'the partition is entirely masked.'.format(subchunk + 1)) + # return x, features_names transform_sets = [k.transform_set for k in config.feature_sets] extracted_chunk_sets = geoio.image_subchunks(subchunk, config) From 81e7cd5d4af5d909b7c944a22050d75f6f0761ca Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:19:47 +1100 Subject: [PATCH 27/33] some funny script to smooth elevation --- scripts/smooth_elevation.py | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 scripts/smooth_elevation.py diff --git a/scripts/smooth_elevation.py b/scripts/smooth_elevation.py new file mode 100644 index 00000000..32dccbda --- /dev/null +++ b/scripts/smooth_elevation.py @@ -0,0 +1,64 @@ +from pathlib import Path +import numpy as np +from scipy.interpolate import interp2d, RegularGridInterpolator +import rasterio as rio +from rasterio.io import DatasetReader +smooth_dem = Path("/home/sudipta/Documents/nci/smooth_dem") +elevation = smooth_dem.joinpath("elevation_small.tif") +ceno = smooth_dem.joinpath("ceno_small.tif") +relief = smooth_dem.joinpath("relief_small.tif") + +with rio.open(elevation) as geotif: + profile: rio.profiles.Profile = geotif.profile + ele: DatasetReader = geotif.read(1, masked=True) + +with rio.open(relief) as geotif: + rel: DatasetReader = geotif.read(1, masked=True) + +with rio.open(ceno) as geotif: + cen: DatasetReader = geotif.read(1, masked=True) + +ele_mod = np.ma.array(data=ele.data, mask=(ele.mask | ((cen.data > 0.01) & (rel.data > 2.3))), fill_value=np.nan) +ele_mod.data[ele_mod.mask] = np.nan +# ele_mod.mask = ele.mask | ((cen.data > 0.01) & (rel.data > 0.01)) + + +def points_removed_interp(): + z = ele_mod + x, y = np.mgrid[0:z.shape[0], 0:z.shape[1]] + x1 = x[~z.mask] + y1 = y[~z.mask] + z1 = z[~z.mask] + print(z1.shape) + return interp2d(x1, y1, z1, kind="linear")(np.arange(z.shape[0]), np.arange(z.shape[1])) + + +def recatangular_interp(): + z = ele_mod.filled(np.nan) + x = np.array(range(ele_mod.shape[0])) + y = np.array(range(ele_mod.shape[1])) + zinterp = RegularGridInterpolator((x, y), z, method="linear") + X2, Y2 = np.meshgrid(x, y) + newpoints = np.array((X2, Y2)).T + + # actual interpolation + z2 = zinterp(newpoints) + z2_masked = np.ma.array(z2, mask=np.isnan(z2)) + return z2_masked + + +profile.update(compress='lzw') + +with rio.open(smooth_dem.joinpath("smooth_elevation_small_masked_ref_2p3.tif"), mode="w", **profile,) as update_dataset: + # out = recatangular_interp() + # out = points_removed_interp() + print(profile) + update_dataset.write(ele_mod, 1) + + +# 
import IPython; IPython.embed(); import sys; sys.exit() +# flat_ele = interp2d(x1, y1, z1) +# (np.arange(ele_mod.shape[0]), np.arange(ele_mod.shape[1])) + + + From cea9660f86e076d11b51be3ed01c58fcd1bdd74d Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:20:08 +1100 Subject: [PATCH 28/33] bugfix --- uncoverml/predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index d8ae5231..1cc8d529 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -208,7 +208,7 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) except ValueError as v: log.warning(v) - x = _fix_for_corrupt_data(x, features_names) + x = _fix_for_corrupt_data(x, feature_names) y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) From eaf39ab2090ca29ce327da016b3f1123893fb5a7 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 2 Feb 2023 18:20:49 +1100 Subject: [PATCH 29/33] wip - prediction on a different extent --- configs/ref_rf.yaml | 1 + uncoverml/config.py | 15 +++++++++------ uncoverml/geoio.py | 13 +++++++++---- uncoverml/scripts/uncoverml.py | 5 ++++- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index 2f3d8c89..cbac2e59 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -71,6 +71,7 @@ learning: prediction: +# prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/uncoverml/config.py b/uncoverml/config.py index 3cf47323..8aa8bbf2 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -164,15 +164,18 @@ def __init__(self, yaml_file): self.krige = self.algorithm == 'krige' if 'prediction' in s: - self.quantiles = s['prediction']['quantiles'] - self.geotif_options = s['prediction']['geotif'] if 'geotif' in \ - s['prediction'] else {} + prediction = s["prediction"] + self.quantiles = prediction['quantiles'] + self.geotif_options = prediction['geotif'] if 'geotif' in prediction else {} self.outbands = None - if 'outbands' in s['prediction']: - self.outbands = s['prediction']['outbands'] - self.thumbnails = s['prediction']['thumbnails'] \ + if 'outbands' in prediction: + self.outbands = prediction['outbands'] + self.thumbnails = prediction['thumbnails'] \ if 'thumbnails' in s['prediction'] else 10 + self.prediction_template = prediction["prediction_template"] \ + if "prediction_template" in prediction else None + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 03c65d95..1a37547e 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -284,14 +284,17 @@ def load_targets(shapefile, targetfield, conf: Config): return targets -def get_image_spec(model, config): +def get_image_spec(model, config: Config): # temp workaround, we should have an image spec to check against nchannels = len(model.get_predict_tags()) return get_image_spec_from_nchannels(nchannels, config) -def get_image_spec_from_nchannels(nchannels, config): - imagelike = config.feature_sets[0].files[0] +def get_image_spec_from_nchannels(nchannels, config: Config): + if config.prediction_template: + imagelike = Path(config.prediction_template).absolute() + else: + imagelike = 
config.feature_sets[0].files[0] template_image = image.Image(RasterioImageSource(imagelike)) eff_shape = template_image.patched_shape(config.patchsize) + (nchannels,) eff_bbox = template_image.patched_bbox(config.patchsize) @@ -476,9 +479,11 @@ def f(image_source): return result -def image_subchunks(subchunk_index, config): +def image_subchunks(subchunk_index, config: Config): + """This is used in prediction only""" def f(image_source): + # pred_image_source = r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) return r result = _iterate_sources(f, config) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 9f7af6ec..14d21832 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -326,7 +326,9 @@ def validate(pipeline_file, model_or_cluster_file, partitions): help='mask file used to limit prediction area') @click.option('-r', '--retain', type=int, default=None, help='mask values where to predict') -def predict(model_or_cluster_file, partitions, mask, retain): +@click.option('-t', '--prediction_template', type=click.Path(exists=True), default=None, + help='mask values where to predict') +def predict(model_or_cluster_file, partitions, mask, retain, prediction_template): with open(model_or_cluster_file, 'rb') as f: state_dict = joblib.load(f) @@ -336,6 +338,7 @@ def predict(model_or_cluster_file, partitions, mask, retain): config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \ else False config.mask = mask if mask else config.mask + config.prediction_template = prediction_template if prediction_template else config.prediction_template if config.mask: config.retain = retain if retain else config.retain From 79510bff201801ffa8c8dfcccf9379eb97ae8e38 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 14:18:40 +1100 Subject: [PATCH 30/33] suppress wanrings --- uncoverml/scripts/uncoverml.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index 14d21832..dd396e34 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -14,6 +14,10 @@ from pathlib import Path import warnings +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=DeprecationWarning) +warnings.filterwarnings(action='ignore', category=Warning) + import click import numpy as np import matplotlib @@ -38,9 +42,6 @@ log = logging.getLogger(__name__) # warnings.showwarning = warn_with_traceback -warnings.filterwarnings(action='ignore', category=FutureWarning) -warnings.filterwarnings(action='ignore', category=DeprecationWarning) -warnings.filterwarnings(action='ignore', category=Warning) @click.group() From 340adaab2d0437776a255d77a99dcfc09c94349c Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 14:20:18 +1100 Subject: [PATCH 31/33] working - prediction on a different extent --- uncoverml/features.py | 3 ++- uncoverml/geoio.py | 48 ++++++++++++++++++++++++++----------------- uncoverml/image.py | 3 ++- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/uncoverml/features.py b/uncoverml/features.py index ffa16da0..dd0cd2e9 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -11,11 +11,12 @@ from uncoverml import patch from uncoverml import transforms from uncoverml.config import Config +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) -def 
extract_subchunks(image_source, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 1a37547e..ceeb3fd6 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -1,4 +1,5 @@ from __future__ import division +from typing import Optional import joblib import os.path from pathlib import Path @@ -78,12 +79,17 @@ def crs(self): class RasterioImageSource(ImageSource): - def __init__(self, filename): + def __init__(self, filename, template_filename: Optional[str] = None): self.filename = filename + self.template = template_filename assert os.path.isfile(filename), '{} does not exist'.format(filename) + + template_geotif = rasterio.open(template_filename, 'r') if self.template else None + with rasterio.open(self.filename, 'r') as geotiff: - self._full_res = (geotiff.width, geotiff.height, geotiff.count) + self._full_res = (geotiff.width, geotiff.height, geotiff.count) if template_geotif is None else \ + (template_geotif.width, template_geotif.height, template_geotif.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes for d in geotiff.dtypes[1:]: @@ -93,7 +99,7 @@ def __init__(self, filename): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = geotiff.transform + A = geotiff.transform if template_geotif is None else template_geotif.transform # No shearing or rotation allowed!! if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -428,7 +434,7 @@ def output_thumbnails(self, ratio=10): resample(f, output_tif=thumbnail, ratio=ratio) -def feature_names(config): +def feature_names(config: Config): results = [] for s in config.feature_sets: @@ -444,24 +450,14 @@ def feature_names(config): def _iterate_sources(f, config): results = [] + template_tif = config.prediction_template for s in config.feature_sets: extracted_chunks = {} for tif in s.files: name = os.path.abspath(tif) - image_source = RasterioImageSource(tif) + image_source = RasterioImageSource(tif, template_filename=template_tif) x = f(image_source) - # TODO this may hurt performance. Consider removal - if type(x) is np.ma.MaskedArray: - count = mpiops.count(x) - # if not np.all(count > 0): - # s = ("{} has no data in at least one band.".format(name) + - # " Valid_pixel_count: {}".format(count)) - # raise ValueError(s) - missing_percent = missing_percentage(x) - t_missing = mpiops.comm.allreduce( - missing_percent) / mpiops.chunks - log.info("{}: {}px {:2.2f}% missing".format( - name, count, t_missing)) + log_missing_percentage(name, x) extracted_chunks[name] = x extracted_chunks = OrderedDict(sorted( extracted_chunks.items(), key=lambda t: t[0])) @@ -470,6 +466,21 @@ def _iterate_sources(f, config): return results +def log_missing_percentage(name, x): + # TODO this may hurt performance. 
Consider removal + if type(x) is np.ma.MaskedArray: + count = mpiops.count(x) + # if not np.all(count > 0): + # s = ("{} has no data in at least one band.".format(name) + + # " Valid_pixel_count: {}".format(count)) + # raise ValueError(s) + missing_percent = missing_percentage(x) + t_missing = mpiops.comm.allreduce( + missing_percent) / mpiops.chunks + log.info("{}: {}px {:2.2f}% missing".format( + name, count, t_missing)) + + def image_resolutions(config): def f(image_source): r = image_source._full_res @@ -482,8 +493,7 @@ def f(image_source): def image_subchunks(subchunk_index, config: Config): """This is used in prediction only""" - def f(image_source): - # pred_image_source = + def f(image_source: RasterioImageSource): r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) return r result = _iterate_sources(f, config) diff --git a/uncoverml/image.py b/uncoverml/image.py index 395c23c9..3ad1912e 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -2,6 +2,7 @@ import logging from affine import Affine +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) @@ -27,7 +28,7 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap=0): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: From a055c1dc1631fc1995e90432f22ad79988023666 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Wed, 8 Feb 2023 17:52:52 +1100 Subject: [PATCH 32/33] bug fix cropped prediction --- uncoverml/config.py | 2 ++ uncoverml/geoio.py | 12 ++++++------ uncoverml/patch.py | 5 +++-- uncoverml/scripts/uncoverml.py | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/uncoverml/config.py b/uncoverml/config.py index 8aa8bbf2..7e565e58 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -176,6 +176,8 @@ def __init__(self, yaml_file): self.prediction_template = prediction["prediction_template"] \ if "prediction_template" in prediction else None + self.is_prediction = False + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index ceeb3fd6..a57d06d2 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -82,14 +82,14 @@ class RasterioImageSource(ImageSource): def __init__(self, filename, template_filename: Optional[str] = None): self.filename = filename - self.template = template_filename assert os.path.isfile(filename), '{} does not exist'.format(filename) - template_geotif = rasterio.open(template_filename, 'r') if self.template else None + template_geotiff = rasterio.open(template_filename, 'r') if template_filename else None with rasterio.open(self.filename, 'r') as geotiff: - self._full_res = (geotiff.width, geotiff.height, geotiff.count) if template_geotif is None else \ - (template_geotif.width, template_geotif.height, template_geotif.count) + tiff = geotiff if template_geotiff is None else template_geotiff + + self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes for d in geotiff.dtypes[1:]: @@ -99,7 +99,7 @@ def __init__(self, filename, template_filename: Optional[str] = None): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = geotiff.transform if template_geotif is None else 
template_geotif.transform + A = tiff.transform # No shearing or rotation allowed!! if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -450,7 +450,7 @@ def feature_names(config: Config): def _iterate_sources(f, config): results = [] - template_tif = config.prediction_template + template_tif = config.prediction_template if config.is_prediction else None for s in config.feature_sets: extracted_chunks = {} for tif in s.files: diff --git a/uncoverml/patch.py b/uncoverml/patch.py index 028cce5d..e7e7f4df 100644 --- a/uncoverml/patch.py +++ b/uncoverml/patch.py @@ -5,6 +5,7 @@ import numpy as np import skimage +from uncoverml.image import Image log = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def point_patches(image, pwidth, points): return output -def _image_to_data(image): +def _image_to_data(image: Image): """ breaks up an image object into arrays suitable for sending to the patching functions @@ -92,7 +93,7 @@ def _image_to_data(image): return data, mask, data_dtype -def all_patches(image, patchsize): +def all_patches(image: Image, patchsize): data, mask, data_dtype = _image_to_data(image) patches = grid_patches(data, patchsize) patch_mask = grid_patches(mask, patchsize) diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py index dd396e34..fd54218a 100644 --- a/uncoverml/scripts/uncoverml.py +++ b/uncoverml/scripts/uncoverml.py @@ -336,6 +336,7 @@ def predict(model_or_cluster_file, partitions, mask, retain, prediction_template model = state_dict["model"] config = state_dict["config"] + config.is_prediction = True config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \ else False config.mask = mask if mask else config.mask From 45941831e1d1e3c49df60eab607c52b4b7568248 Mon Sep 17 00:00:00 2001 From: Sudipta Basak Date: Thu, 9 Feb 2023 11:04:38 +1100 Subject: [PATCH 33/33] prediction on a different extent finally working? 
--- configs/ref_rf.yaml | 14 +++++++------- uncoverml/features.py | 6 ++++-- uncoverml/geoio.py | 22 +++++++++++++--------- uncoverml/image.py | 38 +++++++++++++++++++++++++++----------- 4 files changed, 51 insertions(+), 29 deletions(-) diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index cbac2e59..bdfbf1c0 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -14,19 +14,19 @@ features: - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif transforms: - - standardise - - whiten: - keep_fraction: 0.8 +# - standardise +# - whiten: +# keep_fraction: 0.8 imputation: none preprocessing: imputation: none transforms: - - whiten: - keep_fraction: 0.8 +# - whiten: +# keep_fraction: 0.8 targets: - file: configs/data/geochem_sites.shp + file: configs/data/geochem_sites_cropped.shp property: K_ppm_imp # group_targets: # groups_eps: 0.09 @@ -71,7 +71,7 @@ learning: prediction: -# prediction_template: configs/data/sirsam/dem_foc2.tif + prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/uncoverml/features.py b/uncoverml/features.py index dd0cd2e9..161e3804 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from collections import OrderedDict import numpy as np import pickle @@ -16,10 +17,11 @@ log = logging.getLogger(__name__) -def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize, + template_source: Optional[RasterioImageSource] = None): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index - image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) + image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize, template_source) x = patch.all_patches(image, patchsize) return x diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index a57d06d2..1d477230 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -79,15 +79,12 @@ def crs(self): class RasterioImageSource(ImageSource): - def __init__(self, filename, template_filename: Optional[str] = None): + def __init__(self, filename): self.filename = filename assert os.path.isfile(filename), '{} does not exist'.format(filename) - template_geotiff = rasterio.open(template_filename, 'r') if template_filename else None - with rasterio.open(self.filename, 'r') as geotiff: - tiff = geotiff if template_geotiff is None else template_geotiff self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] @@ -99,7 +96,7 @@ def __init__(self, filename, template_filename: Optional[str] = None): self._dtype = np.dtype(geotiff.dtypes[0]) self._crs = geotiff.crs - A = tiff.transform + A = geotiff.transform # No shearing or rotation allowed!! 
if not ((A[1] == 0) and (A[3] == 0)): raise RuntimeError("Transform to pixel coordinates" @@ -297,7 +294,7 @@ def get_image_spec(model, config: Config): def get_image_spec_from_nchannels(nchannels, config: Config): - if config.prediction_template: + if config.prediction_template and config.is_prediction: imagelike = Path(config.prediction_template).absolute() else: imagelike = config.feature_sets[0].files[0] @@ -447,15 +444,17 @@ def feature_names(config: Config): return results -def _iterate_sources(f, config): +def _iterate_sources(f, config: Config): results = [] template_tif = config.prediction_template if config.is_prediction else None + if config.is_prediction: + log.info(f"Using prediction template {config.prediction_template}") for s in config.feature_sets: extracted_chunks = {} for tif in s.files: name = os.path.abspath(tif) - image_source = RasterioImageSource(tif, template_filename=template_tif) + image_source = RasterioImageSource(tif) x = f(image_source) log_missing_percentage(name, x) extracted_chunks[name] = x @@ -494,7 +493,12 @@ def image_subchunks(subchunk_index, config: Config): """This is used in prediction only""" def f(image_source: RasterioImageSource): - r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) + if config.is_prediction and config.prediction_template is not None: + template_source = RasterioImageSource(config.prediction_template) + else: + template_source = None + r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize, + template_source=template_source) return r result = _iterate_sources(f, config) return result diff --git a/uncoverml/image.py b/uncoverml/image.py index 3ad1912e..0f4494d4 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -1,3 +1,4 @@ +from typing import Optional import numpy as np import logging @@ -28,7 +29,10 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, + chunk_idx=0, nchunks=1, overlap=0, + t_source: Optional[RasterioImageSource] = None, + ): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: @@ -49,6 +53,10 @@ def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap= self.pixsize_x = source.pixsize_x self.pixsize_y = source.pixsize_y self.crs = source.crs + log.debug(f"Image full resolution : {source.full_resolution}") + log.debug(f"Image origin longitude : {source.origin_longitude}") + log.debug(f"Image origin latitude : {source.origin_latitude}") + assert self.pixsize_x > 0 assert self.pixsize_y > 0 @@ -65,21 +73,29 @@ def __init__(self, source: RasterioImageSource, chunk_idx=0, nchunks=1, overlap= self._pix_y_to_coords = dict(zip(pix_y, coords_y)) # exclusive y range of this chunk in full image - ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] - self._offset = np.array([0, ymin], dtype=int) - # exclusive x range of this chunk (same for all chunks) - xmin, xmax = 0, self._full_res[0] + if t_source: + self._t_full_res = t_source.full_resolution + self._t_start_lon = t_source.origin_longitude + self._t_start_lat = t_source.origin_latitude + self._t_pixsize_x = t_source.pixsize_x + self._t_pixsize_y = t_source.pixsize_y + tymin, tymax = construct_splits(self._t_full_res[1], nchunks, overlap)[chunk_idx] + t_x_offset = np.searchsorted(self._coords_x, self._t_start_lon) + t_y_offset = 
np.searchsorted(self._coords_y, self._t_start_lat) + xmin, xmax = t_x_offset, t_x_offset + self._t_full_res[0] + ymin, ymax = t_y_offset + tymin, t_y_offset + tymax + self.resolution = (xmax-xmin, ymax - ymin, self._t_full_res[2]) + else: + # exclusive x range of this chunk (same for all chunks) + xmin, xmax = 0, self._full_res[0] + ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] + self.resolution = (xmax - xmin, ymax - ymin, self._full_res[2]) + self._offset = np.array([xmin, ymin], dtype=int) assert(xmin < xmax) assert(ymin < ymax) - # get resolution of this chunk - xres = self._full_res[0] - yres = ymax - ymin - # Calculate the new values for resolution and bounding box - self.resolution = (xres, yres, self._full_res[2]) - start_bound_x, start_bound_y = self._global_pix2lonlat( np.array([[xmin, ymin]]))[0] # one past the last pixel