
Commit

Merge remote-tracking branch 'origin/data-wrangling' into sudipta-merge
# Conflicts:
#	uncoverml/config.py
#	uncoverml/predict.py
basaks committed May 18, 2023
2 parents df3f699 + 4594183 commit bc440b6
Showing 26 changed files with 1,005 additions and 287 deletions.
1 change: 0 additions & 1 deletion configs/demo_regression.yaml
@@ -33,7 +33,6 @@ mask:
# imputation options are gaus, nn, mean
preprocessing:
imputation: mean

transforms:
# - whiten:
# keep_fraction: 0.8
Empty file modified configs/group_and_split_shapefile.yaml
100755 → 100644
Empty file.
53 changes: 35 additions & 18 deletions configs/ref_rf.yaml
@@ -14,19 +14,19 @@ features:
- path: configs/data/sirsam/k_15v5.tif
- path: configs/data/sirsam/relief_apsect.tif
transforms:
- standardise
- whiten:
keep_fraction: 0.8
# - standardise
# - whiten:
# keep_fraction: 0.8
imputation: none

preprocessing:
imputation: none
transforms:
- whiten:
keep_fraction: 0.8
# - whiten:
# keep_fraction: 0.8

targets:
file: configs/data/geochem_sites.shp
file: configs/data/geochem_sites_cropped.shp
property: K_ppm_imp
# group_targets:
# groups_eps: 0.09
@@ -39,22 +39,39 @@ learning:
random_state: 1
max_depth: 20
optimisation:
searchcv_params:
n_iter: 6
hyperopt_params:
max_evals: 5
step: 2
cv: 2
verbose: 1000
n_points: 3
n_jobs: 6
params_space:
'max_depth': Integer(1, 15)
'n_estimators': Integer(10, 100)
'max_features': Categorical(['auto', 'sqrt', 'log2'])
'min_samples_split': Integer(2, 50)
'min_samples_leaf': Integer(1, 50)
'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')
verbose: true
random_state: 3
scoring: r2 # r2, neg_mean_absolute_error, etc..see note above
algo: bayes # bayes, or anneal
hp_params_space:
max_depth: randint('max_depth', 1, 15)
n_estimators: randint('n_estimators', 5, 25)
max_features: choice('max_features', ['auto', 'sqrt', 'log2'])
min_samples_split: randint('min_samples_split', 2, 50)
min_samples_leaf: randint('min_samples_leaf', 1, 50)
min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5)
max_leaf_nodes: randint('max_leaf_nodes', 10, 50)
# searchcv_params:
# n_iter: 6
# cv: 2
# verbose: 1000
# n_points: 3
# n_jobs: 6
# params_space:
# 'max_depth': Integer(1, 15)
# 'n_estimators': Integer(10, 100)
# 'max_features': Categorical(['auto', 'sqrt', 'log2'])
# 'min_samples_split': Integer(2, 50)
# 'min_samples_leaf': Integer(1, 50)
# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')


prediction:
prediction_template: configs/data/sirsam/dem_foc2.tif
quantiles: 0.95
outbands: 4

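
The hp_params_space entries above are strings naming hyperopt distributions. A minimal sketch of the equivalent search space written directly against hyperopt (an illustration only; the parser uncoverml uses to evaluate these strings is not shown in this diff, and hp.randint with low/high bounds assumes hyperopt >= 0.2.5):

from hyperopt import hp

space = {
    "max_depth": hp.randint("max_depth", 1, 15),
    "n_estimators": hp.randint("n_estimators", 5, 25),
    "max_features": hp.choice("max_features", ["auto", "sqrt", "log2"]),
    "min_samples_split": hp.randint("min_samples_split", 2, 50),
    "min_samples_leaf": hp.randint("min_samples_leaf", 1, 50),
    "min_weight_fraction_leaf": hp.uniform("min_weight_fraction_leaf", 0.0, 0.5),
    "max_leaf_nodes": hp.randint("max_leaf_nodes", 10, 50),
}
# 'algo: bayes' and 'algo: anneal' presumably select hyperopt's TPE and anneal
# suggest functions, run for max_evals trials with cross-validated scoring.
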
42 changes: 13 additions & 29 deletions configs/reference_gb.yaml
@@ -58,17 +58,17 @@ learning:
algorithm: gradientboost
arguments:
target_transform: identity
loss: 'ls'
max_depth: 20
learning_rate: 0.1
n_estimators: 100
subsample: 0.9
min_samples_split: 2
min_samples_leaf: 2
min_weight_fraction_leaf: 0.0
max_features: "auto"
alpha: 0.95
random_state: 3
# loss: 'ls'
# max_depth: 20
# learning_rate: 0.1
n_estimators: 200
# subsample: 0.9
# min_samples_split: 2
# min_samples_leaf: 2
# min_weight_fraction_leaf: 0.0
# max_features: "auto"
# alpha: 0.95
# random_state: 3
optimisation:
hyperopt_params:
max_evals: 5
@@ -89,22 +89,6 @@ learning:
min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5)
max_leaf_nodes: randint('max_leaf_nodes', 10, 50)

searchcv_params:
n_iter: 6
cv: 2
verbose: 1000
n_points: 3
n_jobs: 6
params_space:
'max_depth': Integer(1, 15)
'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform")
'n_estimators': Integer(10, 100)
'subsample': Real(0.01, 1.0, prior='uniform')
'max_features': Categorical(['auto', 'sqrt', 'log2'])
'min_samples_split': Integer(2, 50)
'min_samples_leaf': Integer(1, 50)
'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')
'max_leaf_nodes': Integer(10, 50)

prediction:
# corner_coordinates:
@@ -114,10 +98,10 @@ prediction:
outbands: 1

validation:
#- feature_rank
- feature_rank
- parallel
- k-fold:
folds: 3
folds: 5
random_seed: 1

output:
12 changes: 10 additions & 2 deletions configs/reference_pca.yaml
@@ -4,7 +4,7 @@ memory_fraction: 0.5

features:
- name: my continuous features
type: continuous
type: ordinal
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
@@ -14,11 +14,19 @@ features:
- path: configs/data/sirsam/relief_apsect.tif
- path: configs/data/sirsam/LATITUDE_GRID1.tif
- path: configs/data/sirsam/LONGITUDE_GRID1.tif
imputation: nn
# nodes: 5000

preprocessing:
transforms:
- whiten:
n_components: 4
# n_components: 2
# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed
variation_fraction: 0.98

mask:
file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif
retain: 1

pca:
geotif:
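
For context on the variation_fraction option above, a minimal scikit-learn sketch (an illustrative assumption, not uncoverml's own whiten transform) of PCA whitening that keeps just enough components to explain 98% of the variance:

import numpy as np
from sklearn.decomposition import PCA

# Stand-in covariate matrix; in practice this would be the stacked, imputed feature columns.
X = np.random.default_rng(0).normal(size=(500, 8))
# A fractional n_components keeps the smallest number of components explaining that share of variance.
pca = PCA(n_components=0.98, whiten=True, svd_solver="full")
X_white = pca.fit_transform(X)
print(pca.n_components_, X_white.shape)
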
28 changes: 15 additions & 13 deletions configs/reference_xgboost.yaml
@@ -1,26 +1,29 @@

experiment: my_run
patchsize: 0
memory_fraction: 0.5

features:
- name: my continuous features
- name: my continuous features 2
type: continuous
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
- path: configs/data/sirsam/dem_foc2.tif
- path: configs/data/sirsam/outcrop_dis2.tif
# - path: configs/data/sirsam/k_15v5.tif
- path: configs/data/sirsam/relief_apsect.tif
# - path: configs/data/sirsam/LATITUDE_GRID1.tif
# - path: configs/data/sirsam/LONGITUDE_GRID1.tif
# - directory: configs/data/weights/
transforms:
- identity
# - whiten:
# keep_fraction: 0.98
imputation: none
imputation: mean
- name: my continuous features 1
type: continuous
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
transforms:
- identity
# - whiten:
# keep_fraction: 0.98
imputation: gauss

preprocessing:
imputation: none
@@ -125,7 +128,7 @@ prediction:
outbands: 1

validation:
#- feature_rank
# - feature_rank
- parallel
- shapley
# - permutation_importance
@@ -135,9 +138,8 @@ validation:
random_seed: 1

oos_validation:
file: configs/data/weights/Gamma_K_50.shp
property: K2O
file: configs/data/geochem_sites.shp
property: K_ppm_imp

output:
directory: ref_xgb/

4 changes: 2 additions & 2 deletions configs/resampling.yaml
100755 → 100644
@@ -2,8 +2,8 @@
# when undersample == true, we choose points with or without bootstrap

targets:
file: configs/data/weights/Gamma_K_50.shp
property: K2O
file: configs/data/sirsam/out_resampled/geochem_sites.shp
property: K_ppm_imp
resample:
spatial:
undersample: false
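
A toy pandas illustration (assumed semantics of the bootstrap comment above, not uncoverml's resampling code) of under-sampling target points with or without bootstrap:

import pandas as pd

targets = pd.DataFrame({"K_ppm_imp": [120.0, 340.0, 90.0, 210.0, 55.0]})
bootstrap = True  # when bootstrap is enabled, points are drawn with replacement
sampled = targets.sample(n=3, replace=bootstrap, random_state=1)
print(sampled)
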
61 changes: 61 additions & 0 deletions scripts/check_covariates.py
@@ -0,0 +1,61 @@
import subprocess
from joblib import delayed, Parallel
from pathlib import Path
import rasterio as rio
from rasterio.io import DatasetReader
import numpy as np
import pandas as pd

dir = "configs/data/sirsam"

mask = Path(dir).joinpath("dem_foc2.tif")

with rio.open(mask) as geotif:
mask_raster = geotif.read(1, masked=True)



def _parallel_read(r: Path):
try:
with rio.open(r) as geotif:
raster: DatasetReader = geotif.read(1, masked=True)
m = geotif.meta
m['crs'] = m['crs'].to_string()
t = m.pop('transform')
m['pixsize_x'] = t[0]
m['top_left_x'] = t[2]
m['pixsize_y'] = -t[4]
m['top_left_y'] = t[5]
raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas
# print(raster)
m['all_finite'] = np.all(np.isfinite(raster))
m['any_nan'] = np.any(np.isnan(raster))
m['any_large'] = np.any(np.abs(raster) > 1e10)
m['min'] = np.ma.min(raster)
m['mean'] = np.ma.mean(raster)
m['median'] = np.ma.median(raster)
m['max'] = np.ma.max(raster)
m['std'] = np.ma.std(raster)
m['skew'] = 3 * (m['mean'] - m['median']) / m['std']
# subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True)
# raster_attrs[r.stem] = m
return m
except Exception as e:
print(r)
print(e)
return [None] * 14


rets = Parallel(
n_jobs=1,
verbose=100,
)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif"))

import pickle
with open("rets.pk", 'wb') as f:
pickle.dump(rets, f)

raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)}

df = pd.DataFrame.from_dict(raster_attrs)
df.to_csv("quality.csv")
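
The skew column above is Pearson's second skewness coefficient, 3 * (mean - median) / std. A small follow-up sketch (an assumed workflow, not part of this commit) for inspecting the quality.csv it writes, where each column is a covariate and each row one of the collected stats:

import pandas as pd

df = pd.read_csv("quality.csv", index_col=0)
report = df.T  # one row per covariate raster
print(report[["any_nan", "any_large", "min", "max", "std", "skew"]])
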
70 changes: 70 additions & 0 deletions scripts/dedupe_shape.py
@@ -0,0 +1,70 @@
from pathlib import Path
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd


def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path):
"""
:param shp: input shapefile with dense points
:param tif: sample tif to read resolution details
:param deduped_shp: output shapefile with one point per down-sampled raster resolution
:return:
"""
print("====================================\n", f"deduping {shp.as_posix()}")
geom_cols = ['POINT_X', 'POINT_Y']
pts = gpd.read_file(shp)
for g in geom_cols:
if g in pts.columns:
pts = pts.drop(g, axis=1)
coords = np.array([(p.x, p.y) for p in pts.geometry])
geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index)
pts = pts.merge(geom, left_index=True, right_index=True)

with rasterio.open(tif) as src:
# resample data to target shape
data = src.read(
out_shape=(
src.count,
int(src.height / downscale_factor),
int(src.width / downscale_factor)
),
resampling=rasterio.enums.Resampling.bilinear
)
# scale image transform
transform = src.transform * src.transform.scale(
(src.width / data.shape[-1]),
(src.height / data.shape[-2])
)
pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1])

pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count'))
pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean()
pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols'])

pts_deduped = gpd.GeoDataFrame(pts_deduped,
geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']),
crs="EPSG:3577" # Australian Albers
)
pts_deduped.to_file(deduped_shp.as_posix())
return pts_deduped


if __name__ == '__main__':
shapefiles = Path("configs/data/")
downscale_factor = 6 # keep 1 point in a 6x6 cell

dem = Path('/home/my_dem.tif')
output_dir = Path('1in6')
output_dir.mkdir(exist_ok=True, parents=True)

# for s in shapefiles.glob("*.shp"):
# deduped_shp = output_dir.joinpath(s.name)
# dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp)

Parallel(
n_jobs=-1,
verbose=100,
)(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp"))
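
A toy illustration (made-up points, not part of the commit) of the dedupe step in dedupe_raster: points that land in the same downsampled (row, col) cell are averaged into a single record, with pixel_count recording how many were merged.

import pandas as pd

pts = pd.DataFrame({
    "rows": [0, 0, 1],
    "cols": [0, 0, 2],
    "K_ppm_imp": [100.0, 300.0, 50.0],
    "POINT_X": [10.0, 11.0, 40.0],
    "POINT_Y": [5.0, 6.0, 20.0],
})
counts = pts.groupby(["rows", "cols"], as_index=False).agg(pixel_count=("rows", "count"))
means = pts.groupby(["rows", "cols"], as_index=False).mean()
print(means.merge(counts, how="inner", on=["rows", "cols"]))
# cell (0, 0) collapses to one point with K_ppm_imp == 200.0 and pixel_count == 2
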
