diff --git a/configs/demo_regression.yaml b/configs/demo_regression.yaml index eda84ce0..c120e7aa 100644 --- a/configs/demo_regression.yaml +++ b/configs/demo_regression.yaml @@ -33,7 +33,6 @@ mask: # imputation options are gaus, nn, mean preprocessing: imputation: mean - transforms: # - whiten: # keep_fraction: 0.8 diff --git a/configs/group_and_split_shapefile.yaml b/configs/group_and_split_shapefile.yaml old mode 100755 new mode 100644 diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index abdb678f..bdfbf1c0 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -14,19 +14,19 @@ features: - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif transforms: - - standardise - - whiten: - keep_fraction: 0.8 +# - standardise +# - whiten: +# keep_fraction: 0.8 imputation: none preprocessing: imputation: none transforms: - - whiten: - keep_fraction: 0.8 +# - whiten: +# keep_fraction: 0.8 targets: - file: configs/data/geochem_sites.shp + file: configs/data/geochem_sites_cropped.shp property: K_ppm_imp # group_targets: # groups_eps: 0.09 @@ -39,22 +39,39 @@ learning: random_state: 1 max_depth: 20 optimisation: - searchcv_params: - n_iter: 6 + hyperopt_params: + max_evals: 5 + step: 2 cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'n_estimators': Integer(10, 100) - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') + verbose: true + random_state: 3 + scoring: r2 # r2, neg_mean_absolute_error, etc..see note above + algo: bayes # bayes, or anneal + hp_params_space: + max_depth: randint('max_depth', 1, 15) + n_estimators: randint('n_estimators', 5, 25) + max_features: choice('max_features', ['auto', 'sqrt', 'log2']) + min_samples_split: randint('min_samples_split', 2, 50) + min_samples_leaf: randint('min_samples_leaf', 1, 50) + min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) + max_leaf_nodes: randint('max_leaf_nodes', 10, 50) +# searchcv_params: +# n_iter: 6 +# cv: 2 +# verbose: 1000 +# n_points: 3 +# n_jobs: 6 +# params_space: +# 'max_depth': Integer(1, 15) +# 'n_estimators': Integer(10, 100) +# 'max_features': Categorical(['auto', 'sqrt', 'log2']) +# 'min_samples_split': Integer(2, 50) +# 'min_samples_leaf': Integer(1, 50) +# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') prediction: + prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/configs/reference_gb.yaml b/configs/reference_gb.yaml index 2c1717a8..c5f7f787 100644 --- a/configs/reference_gb.yaml +++ b/configs/reference_gb.yaml @@ -58,17 +58,17 @@ learning: algorithm: gradientboost arguments: target_transform: identity - loss: 'ls' - max_depth: 20 - learning_rate: 0.1 - n_estimators: 100 - subsample: 0.9 - min_samples_split: 2 - min_samples_leaf: 2 - min_weight_fraction_leaf: 0.0 - max_features: "auto" - alpha: 0.95 - random_state: 3 +# loss: 'ls' +# max_depth: 20 +# learning_rate: 0.1 + n_estimators: 200 +# subsample: 0.9 +# min_samples_split: 2 +# min_samples_leaf: 2 +# min_weight_fraction_leaf: 0.0 +# max_features: "auto" +# alpha: 0.95 +# random_state: 3 optimisation: hyperopt_params: max_evals: 5 @@ -89,22 +89,6 @@ learning: min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) max_leaf_nodes: randint('max_leaf_nodes', 10, 50) - searchcv_params: - n_iter: 6 - cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - 
params_space: - 'max_depth': Integer(1, 15) - 'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform") - 'n_estimators': Integer(10, 100) - 'subsample': Real(0.01, 1.0, prior='uniform') - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') - 'max_leaf_nodes': Integer(10, 50) prediction: # corner_coordinates: @@ -114,10 +98,10 @@ prediction: outbands: 1 validation: - #- feature_rank + - feature_rank - parallel - k-fold: - folds: 3 + folds: 5 random_seed: 1 output: diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index aea2c3c9..46b76757 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -4,7 +4,7 @@ memory_fraction: 0.5 features: - name: my continuous features - type: continuous + type: ordinal files: - path: configs/data/sirsam/er_depg.tif - path: configs/data/sirsam/sagawet_b_sir.tif @@ -14,11 +14,19 @@ features: - path: configs/data/sirsam/relief_apsect.tif - path: configs/data/sirsam/LATITUDE_GRID1.tif - path: configs/data/sirsam/LONGITUDE_GRID1.tif + imputation: nn +# nodes: 5000 preprocessing: transforms: - whiten: - n_components: 4 +# n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed + variation_fraction: 0.98 + +mask: + file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif + retain: 1 pca: geotif: diff --git a/configs/reference_xgboost.yaml b/configs/reference_xgboost.yaml index 6c29d0e6..bf5fb8e0 100644 --- a/configs/reference_xgboost.yaml +++ b/configs/reference_xgboost.yaml @@ -1,26 +1,29 @@ - experiment: my_run patchsize: 0 memory_fraction: 0.5 features: - - name: my continuous features + - name: my continuous features 2 type: continuous files: - - path: configs/data/sirsam/er_depg.tif - - path: configs/data/sirsam/sagawet_b_sir.tif - path: configs/data/sirsam/dem_foc2.tif - path: configs/data/sirsam/outcrop_dis2.tif -# - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif -# - path: configs/data/sirsam/LATITUDE_GRID1.tif -# - path: configs/data/sirsam/LONGITUDE_GRID1.tif -# - directory: configs/data/weights/ transforms: - identity # - whiten: # keep_fraction: 0.98 - imputation: none + imputation: mean + - name: my continuous features 1 + type: continuous + files: + - path: configs/data/sirsam/er_depg.tif + - path: configs/data/sirsam/sagawet_b_sir.tif + transforms: + - identity + # - whiten: + # keep_fraction: 0.98 + imputation: gauss preprocessing: imputation: none @@ -125,7 +128,7 @@ prediction: outbands: 1 validation: - #- feature_rank +# - feature_rank - parallel - shapley # - permutation_importance @@ -135,9 +138,8 @@ validation: random_seed: 1 oos_validation: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/geochem_sites.shp + property: K_ppm_imp output: directory: ref_xgb/ - diff --git a/configs/resampling.yaml b/configs/resampling.yaml old mode 100755 new mode 100644 index 791831b0..c84cfc73 --- a/configs/resampling.yaml +++ b/configs/resampling.yaml @@ -2,8 +2,8 @@ # when undersample == true, we choose points with or without bootstrap targets: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/sirsam/out_resampled/geochem_sites.shp + property: K_ppm_imp resample: spatial: undersample: false diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py new file mode 100644 index 00000000..117f556c --- /dev/null +++ 
b/scripts/check_covariates.py @@ -0,0 +1,61 @@ +import subprocess +from joblib import delayed, Parallel +from pathlib import Path +import rasterio as rio +from rasterio.io import DatasetReader +import numpy as np +import pandas as pd + +dir = "configs/data/sirsam" + +mask = Path(dir).joinpath("dem_foc2.tif") + +with rio.open(mask) as geotif: + mask_raster = geotif.read(1, masked=True) + + + +def _parallel_read(r: Path): + try: + with rio.open(r) as geotif: + raster: DatasetReader = geotif.read(1, masked=True) + m = geotif.meta + m['crs'] = m['crs'].to_string() + t = m.pop('transform') + m['pixsize_x'] = t[0] + m['top_left_x'] = t[2] + m['pixsize_y'] = -t[4] + m['top_left_y'] = t[5] + raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas + # print(raster) + m['all_finite'] = np.all(np.isfinite(raster)) + m['any_nan'] = np.any(np.isnan(raster)) + m['any_large'] = np.any(np.abs(raster) > 1e10) + m['min'] = np.ma.min(raster) + m['mean'] = np.ma.mean(raster) + m['median'] = np.ma.median(raster) + m['max'] = np.ma.max(raster) + m['std'] = np.ma.std(raster) + m['skew'] = 3 * (m['mean'] - m['median']) / m['std'] + # subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True) + # raster_attrs[r.stem] = m + return m + except Exception as e: + print(r) + print(e) + return [None] * 14 + + +rets = Parallel( + n_jobs=1, + verbose=100, +)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif")) + +import pickle +with open("rets.pk", 'wb') as f: + pickle.dump(rets, f) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)} + +df = pd.DataFrame.from_dict(raster_attrs) +df.to_csv("quality.csv") diff --git a/scripts/dedupe_shape.py b/scripts/dedupe_shape.py new file mode 100644 index 00000000..42c2b440 --- /dev/null +++ b/scripts/dedupe_shape.py @@ -0,0 +1,70 @@ +from pathlib import Path +from joblib import Parallel, delayed +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd + + +def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path): + """ + :param shp: input shapefile with dense points + :param tif: sample tif to read resolution details + :param deduped_shp: output shapefile with one point per down-sampled raster resolution + :return: + """ + print("====================================\n", f"deduping {shp.as_posix()}") + geom_cols = ['POINT_X', 'POINT_Y'] + pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g, axis=1) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) + pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, + geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']), + 
crs="EPSG:3577" # Australian Albers + ) + pts_deduped.to_file(deduped_shp.as_posix()) + return pts_deduped + + +if __name__ == '__main__': + shapefiles = Path("configs/data/") + downscale_factor = 6 # keep 1 point in a 6x6 cell + + dem = Path('/home/my_dem.tif') + output_dir = Path('1in6') + output_dir.mkdir(exist_ok=True, parents=True) + + # for s in shapefiles.glob("*.shp"): + # deduped_shp = output_dir.joinpath(s.name) + # dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp) + + Parallel( + n_jobs=-1, + verbose=100, + )(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp")) diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py new file mode 100644 index 00000000..e0bbc3fb --- /dev/null +++ b/scripts/dedupe_with_median_scratch.py @@ -0,0 +1,293 @@ +from typing import Dict +import csv +from collections import defaultdict +from os import path +from pathlib import Path +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd +from joblib import Parallel, delayed + + +def read_list_file(list_path: str): + files = [] + csvfile = path.abspath(list_path) + with open(csvfile, 'r') as f: + reader = csv.reader(f) + tifs = list(reader) + tifs = [f[0].strip() for f in tifs + if (len(f) > 0 and f[0].strip() and + f[0].strip()[0] != '#')] + for f in tifs: + files.append(path.abspath(f)) + return files + + +def generate_key_val(gl, shorts): + eight_char_name = Path(gl).name[:8] + if eight_char_name not in shorts: + shorts[eight_char_name] += 1 + else: + shorts[eight_char_name] += 1 + return gl, eight_char_name + str(shorts[eight_char_name]) + + +# import IPython; IPython.embed(); import sys; sys.exit() + + +# data_location = \ +# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") +# Read points from shapefile + +# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") + +# geotifs = { +# "relief_radius4.tif": "relief4", +# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", +# "MvrtpLL_smooth.tif": "mrvtpll_s", +# "MvrtpLL_fin.tif": "mvrtpll_f", +# "mrvbf_9.tif": "mrvbf_9", +# "Rad2016K_Th.tif": "rad2016kth", +# "Thorium_2016.tif": "th_2016", +# "Mesozoic_older_raster_MEAN.tif": "meso_mean", +# "LOC_distance_to_coast.tif": "loc_dis", +# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", +# "water-85m_1.tif": "water_85m", +# "clim_RSM_albers.tif": "clim_rsm", +# "tpi_300.tif": "tpi_300", +# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", +# "si_geol1.tif": "si_geol1", +# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", +# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", +# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", +# "Gravity_land.tif": "gravity", +# "dem_fill.tif": "dem", +# "Clim_Prescott_LindaGregory.tif": "clim_linda", +# "slope_fill2.tif": "slopefill2", +# "clim_PTA_albers.tif": "clim_alber", +# "SagaWET9cell_M.tif": "sagawet", +# "ceno_euc_aust1.tif": "ceno_euc", +# "s2-dpca-85m_band1.tif": "s2_band1", +# "s2-dpca-85m_band2.tif": "s2_band2", +# "s2-dpca-85m_band3.tif": "s2_band3", +# "3dem_mag0_finn.tif": "3dem_mag0", +# "3dem_mag1_fin.tif": "3dem_mag1", +# "3dem_mag2.tif": "3dem_mag2", +# } + +# geotifs = { +# "relief_apsect.tif": "relief4", +# "LATITUDE_GRID1.tif": "latitude", +# "LONGITUDE_GRID1.tif": "longitude", +# "er_depg.tif": "er_depg", +# "sagawet_b_sir.tif": "sagawet", +# "dem_foc2.tif": "dem_foc2", +# "outcrop_dis2.tif": "outcrop", +# "k_15v5.tif": "k_15v5", +# } + 
+majors = [ + "SIO2", + "TIO2", + "AL2O3", + "FE2O3TOT", + "FE2O3", + "FEO", + "MNO", + "MGO", + "CAO", + "NA2O", + "K2O", + "P2O5", +] + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], + dedupe: bool = False, + intersect_parallel: bool = False, + pixel_count: bool = False, + dedupe_with_median: bool = True + ): + print("====================================\n", f"intersecting {shp.as_posix()}") +pts = gpd.read_file(shp) +coords = np.array([(p.x, p.y) for p in pts.geometry]) +geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) +pts = pts.merge(geom, left_index=True, right_index=True) + if dedupe: + tif_name = list(geotifs.keys())[0] + tif = Path(tif_name) + # orig_cols = pts.columns + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + # keep one only + # pts_deduped = pts.sort_values( + # by=geom_cols, ascending=[True, True] + # ).groupby(by=rows_and_cols, as_index=False).first()[orig_cols] + + # keep mean of repeated observations in a pixel + # + pts_count = pts.groupby(by=rows_and_cols, as_index=False).agg(px_count=('rows', 'count')) + # if dedupe_with_median: + # import IPython; IPython.embed(); import sys; sys.exit() + +rows_and_cols = ['rows', 'cols'] + +pts_majors = pts[majors + rows_and_cols + geom_cols + ['geometry']] +for m in majors: + pts[m] = pts[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=rows_and_cols) +rename_dict = {k: k + '_m' for k in majors} +pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=rows_and_cols) +for m in majors: + print(m) + all_pts_with_count_and_median[m + '_distance_from_median'] = \ + abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ + abs(all_pts_with_count_and_median[m + '_m']) + +def sum_distance_from_median(x): + s = np.nansum([v for k, v in x.items() if k.endswith('_distance_from_median')]) + return s + +all_pts_with_count_and_median['median_sum'] = all_pts_with_count_and_median.apply(lambda x: sum_distance_from_median( + x), axis=1) + +all_pts_with_count_and_median_min_median_sum = \ + all_pts_with_count_and_median.groupby(by=rows_and_cols, as_index=False).agg( + median_sum_min=('median_sum', 'min'), + median_sum_std=('median_sum', 'std') + ) + +all_pts = all_pts_with_count_and_median.merge(all_pts_with_count_and_median_min_median_sum, how='left', on=['rows', + 'cols']) + + +# all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] + +# min_pts = all_pts.groupby(by=rows_and_cols, as_index=False).median_sum_to_median_min_diff.agg( +# ['min', 'std'] +# ) + +final_pts = all_pts[all_pts.median_sum==all_pts.median_sum_min] +final_pts_sorted = final_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) +all_pts_sorted = all_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) + +# all_pts.merge(min_pts, how="inner", on=['rows', 'cols', 
"median_sum_to_median_min_diff"]) + +# all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], +# how='left',on=rows_and_cols) + + else: # dedupe with mean + pts_mean = pts[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=rows_and_cols) + # import IPython; IPython.embed(); import sys; sys.exit() + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) + # pts_deduped = pts.drop_duplicates(subset=rows_and_cols)[orig_cols] + coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) + else: + pts_deduped = pts + coords_deduped = coords + + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) + + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + +if __name__ == '__main__': + # local + output_dir = Path('out_resampled') + shapefile_location = Path("configs/data") + downscale_factor = 2 # keep 1 point in a 2x2 cell + + geom_cols = ['POINT_X', 'POINT_Y'] + + covariastes_list = "configs/data/sirsam/covariates_list.txt" + geotifs_list = read_list_file(list_path=covariastes_list) + shorts = defaultdict(int) + + geotifs = {} + + for gl in geotifs_list: + k, v = generate_key_val(gl, shorts) + geotifs[k] = v + + # check all required files are available on disc + for k, v in geotifs.items(): + print(f"checking if {k} exists") + assert Path(k).exists() + + print("Add under 
the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py new file mode 100644 index 00000000..5140cb59 --- /dev/null +++ b/scripts/interpolate_shapes.py @@ -0,0 +1,68 @@ +from functools import partial +from pathlib import Path +from scipy import interpolate +import numpy as np +import pandas as pd +import geopandas as gpd +from joblib import Parallel, delayed + + +def interpolate_1d(row, thicknesses): + conductivities = row[conductivity_cols].values + f = interpolate.interp1d(thicknesses, conductivities, bounds_error=False, + fill_value=(conductivities[0], conductivities[-1]), assume_sorted=True) + new_conductivities = f(new_thicknesses) + return new_conductivities + + +def interpolate_shape(shp): + print(f"interpolating {shp}") + df = gpd.read_file(shp) + thicknesses = df.loc[0, thickness_cols].values.cumsum() + interpolate_1d_local = partial(interpolate_1d, thicknesses=thicknesses) + interpolated_conductivities = df.apply( + func=interpolate_1d_local, + axis=1, + raw=False, + ) + df.loc[:, conductivity_cols] = np.vstack(interpolated_conductivities) + df.loc[:, thickness_cols] = np.tile(new_thicknesses, (df.shape[0], 1)) + target_shp = output_dir.joinpath(Path(shp).name) + df.to_file(target_shp) + print(f"wrote interpolated shapefile {target_shp}") + + +if __name__ == '__main__': + + ref_df = gpd.read_file("/g/data/ge3/aem_sections/AEM_covariates/NT_1_albers.shp", rows=1) + + shapes_to_convert = [ + # "/g/data/ge3/data/AEM_shapes/AEM_ERC_ALL_interp_data_Albers_Ceno.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_3_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_4_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_5_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_6_albers.shp", + "/g/data/ge3/data/AEM_shapes/Mundi_Mundi_albers.shp", + ] + output_dir = Path('interpolated') + output_dir.mkdir(exist_ok=True, parents=True) + + conductivity_cols = [c for c in ref_df.columns if c.startswith('cond_')] + thickness_cols = [t for t in ref_df.columns if t.startswith('thick_')] + new_thicknesses = np.array(ref_df.loc[0, thickness_cols].values.cumsum(), dtype=np.float32) + + rets = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(interpolate_shape)(s) for s in shapes_to_convert) \ No newline at end of file diff --git a/scripts/intersect_raster_with_shape.py b/scripts/intersect_raster_with_shape.py new file mode 100644 index 00000000..b25a21f9 --- /dev/null +++ b/scripts/intersect_raster_with_shape.py @@ -0,0 +1,62 @@ +# 
/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318_std.tif +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif + + +import geopandas as gpd +import numpy as np +import pandas as pd +import rasterio as rio + + +# with rasterio.open(tif) as src: +# # resample data to target shape +# data = src.read( +# out_shape=( +# src.count, +# int(src.height / downscale_factor), +# int(src.width / downscale_factor) +# ), +# resampling=rasterio.enums.Resampling.bilinear +# ) +# # scale image transform +# transform = src.transform * src.transform.scale( +# (src.width / data.shape[-1]), +# (src.height / data.shape[-2]) +# ) +# pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + +def raster_points() -> gpd.GeoDataFrame: + with rio.Env(): + with rio.open( + '/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif') as src: + crs = src.crs + # create 1D coordinate arrays (coordinates of the pixel center) + xmin, ymax = np.around(src.xy(0.00, 0.00), 9) # src.xy(0, 0) + xmax, ymin = np.around(src.xy(src.height - 1, src.width - 1), 9) # src.xy(src.width-1, src.height-1) + x = np.linspace(xmin, xmax, src.width) + y = np.linspace(ymax, ymin, src.height) # max -> min so coords are top -> bottom + + # create 2D arrays + xs, ys = np.meshgrid(x, y) + zs = src.read(1) + + # Apply NoData mask + mask = src.read_masks(1) > 0 + xs, ys, zs = xs[mask], ys[mask], zs[mask] + data = {"X": pd.Series(xs.ravel()), + "Y": pd.Series(ys.ravel()), + "Z": pd.Series(zs.ravel())} + df = pd.DataFrame(data=data) + geometry = gpd.points_from_xy(df.X, df.Y) + gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry) + return gdf + + +raster_gdf = raster_points() + +shape_file = "" +shp_gdf = gpd.read_file(shape_file) +gpd.overlay(raster_gdf, shp_gdf, how="intersection") +print(raster_gdf.head()) + diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index f1fa77f0..ea50d789 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -30,73 +30,22 @@ def generate_key_val(gl, shorts): shorts[eight_char_name] += 1 else: shorts[eight_char_name] += 1 - return Path(gl).name, eight_char_name + str(shorts[eight_char_name]) - - -# import IPython; IPython.embed(); import sys; sys.exit() - - -# data_location = \ -# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") -# Read points from shapefile - -# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") - -# geotifs = { -# "relief_radius4.tif": "relief4", -# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", -# "MvrtpLL_smooth.tif": "mrvtpll_s", -# "MvrtpLL_fin.tif": "mvrtpll_f", -# "mrvbf_9.tif": "mrvbf_9", -# "Rad2016K_Th.tif": "rad2016kth", -# "Thorium_2016.tif": "th_2016", -# "Mesozoic_older_raster_MEAN.tif": "meso_mean", -# "LOC_distance_to_coast.tif": "loc_dis", -# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", -# "water-85m_1.tif": "water_85m", -# "clim_RSM_albers.tif": "clim_rsm", -# "tpi_300.tif": "tpi_300", -# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", -# "si_geol1.tif": "si_geol1", -# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", -# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", -# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", -# "Gravity_land.tif": "gravity", -# "dem_fill.tif": "dem", -# 
"Clim_Prescott_LindaGregory.tif": "clim_linda", -# "slope_fill2.tif": "slopefill2", -# "clim_PTA_albers.tif": "clim_alber", -# "SagaWET9cell_M.tif": "sagawet", -# "ceno_euc_aust1.tif": "ceno_euc", -# "s2-dpca-85m_band1.tif": "s2_band1", -# "s2-dpca-85m_band2.tif": "s2_band2", -# "s2-dpca-85m_band3.tif": "s2_band3", -# "3dem_mag0_finn.tif": "3dem_mag0", -# "3dem_mag1_fin.tif": "3dem_mag1", -# "3dem_mag2.tif": "3dem_mag2", -# } - -# geotifs = { -# "relief_apsect.tif": "relief4", -# "LATITUDE_GRID1.tif": "latitude", -# "LONGITUDE_GRID1.tif": "longitude", -# "er_depg.tif": "er_depg", -# "sagawet_b_sir.tif": "sagawet", -# "dem_foc2.tif": "dem_foc2", -# "outcrop_dis2.tif": "outcrop", -# "k_15v5.tif": "k_15v5", -# } - - -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False): + return gl, eight_char_name + str(shorts[eight_char_name]) + + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, + intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g) coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) tif_name = list(geotifs.keys())[0] - tif = data_location.joinpath(tif_name) + tif = Path(tif_name) orig_cols = pts.columns with rasterio.open(tif) as src: # resample data to target shape @@ -124,40 +73,76 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() - # import IPython; IPython.embed(); import sys; sys.exit() - pts_deduped = pts_mean.merge( - pts_count, how='inner', on=['rows', 'cols']).merge( - pts[['rows', 'cols', 'geometry']], how='inner', on=['rows', 'cols'] - ) + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: pts_deduped = pts coords_deduped = coords - for k, v in geotifs.items(): - print(f"adding {k} to output dataframe") - with rasterio.open(data_location.joinpath(k)) as src: - pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + print("=====================Intersecting parallelly =======================") + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(k, coords_deduped) - pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) - output_dir = Path('out_resampled') + # pts_deduped 
= gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster: str, coords_deduped: np.ndarray): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') out_shp = output_dir.joinpath(shp.name) - pts_deduped.to_file(out_shp.as_posix()) - print(f"saved intersected shapefile at {out_shp.as_posix()}") - # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) - return out_shp + df3 = df2[list(geotifs.values())] + finite_indices = ((np.isfinite(df3)).sum(axis=1) == len(geotifs)) & \ + ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)) + df4 = df2.loc[finite_indices, :] + df5 = df2.loc[~finite_indices, :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and therefore the _cleaned.shp file is not created") + print(f"No points dropped and therefore the _cleaned_dropped.shp file is not created") + except Exception as e: + print(e) + print(f"Check this shapefile {shp}") if __name__ == '__main__': # local - data_location = Path("configs/data/sirsam") - # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') + output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - shp = shapefile_location.joinpath('geochem_sites.shp') - - downscale_factor = 2 # keep 1 point in a 2x2 cell + downscale_factor = 1 # no downsampling: keep one point per original pixel geom_cols = ['POINT_X', 'POINT_Y'] @@ -171,28 +156,20 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = k, v = generate_key_val(gl, shorts) geotifs[k] = v - for k, v in geotifs.items(): - print(f"\"{k}\": \"{v}\"") - # check all required files are available on disc for k, v in geotifs.items(): print(f"checking if {k} exists") - assert data_location.joinpath(k).exists() + assert Path(k).exists() - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=True) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) - df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - if df5.shape[0]: - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) - else: - print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - -# rets = Parallel( - # n_jobs=-1, - # verbose=100, - # )(delayed(intersect_and_sample_shp)(s) for s in shapefile_location.glob("*.shp")) + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + 
print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("geochem_sites.shp"): + intersect_sample_and_clean(s, dedupe=True, write_dropped=True, intersect_parallel=True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) diff --git a/scripts/plot_true_vs_prediction.py b/scripts/plot_true_vs_prediction.py index 819ad888..f7f4902a 100644 --- a/scripts/plot_true_vs_prediction.py +++ b/scripts/plot_true_vs_prediction.py @@ -99,3 +99,25 @@ def plot_quantiles(line): plt.show() + + +xx = line.d +y_test = line.Prediction +y_lower = line['Lower_quan'] +y_upper = line['Upper_quan'] +plt.plot(xx, line.y_true, "b.", markersize=10, label="Observed log10(conductivity)") +plt.plot(xx, line.out, "g", markersize=10, linestyle='-', + label="Kriged log10(conductivity)") +plt.plot(xx, y_test, color='orange', linestyle='-', + # marker='o', + label="Predicted median") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + # (95362.213934-line.d).ravel() + (line.d).ravel(), + y_lower, y_upper, alpha=0.4, label='Predicted 90% interval' +) +plt.legend(loc='upper left') +plt.ylabel('log10(conductivity)') +plt.xlabel('Distance along flight line (meters)') diff --git a/scripts/smooth_elevation.py b/scripts/smooth_elevation.py new file mode 100644 index 00000000..32dccbda --- /dev/null +++ b/scripts/smooth_elevation.py @@ -0,0 +1,64 @@ +from pathlib import Path +import numpy as np +from scipy.interpolate import interp2d, RegularGridInterpolator +import rasterio as rio +from rasterio.io import DatasetReader +smooth_dem = Path("/home/sudipta/Documents/nci/smooth_dem") +elevation = smooth_dem.joinpath("elevation_small.tif") +ceno = smooth_dem.joinpath("ceno_small.tif") +relief = smooth_dem.joinpath("relief_small.tif") + +with rio.open(elevation) as geotif: + profile: rio.profiles.Profile = geotif.profile + ele: DatasetReader = geotif.read(1, masked=True) + +with rio.open(relief) as geotif: + rel: DatasetReader = geotif.read(1, masked=True) + +with rio.open(ceno) as geotif: + cen: DatasetReader = geotif.read(1, masked=True) + +ele_mod = np.ma.array(data=ele.data, mask=(ele.mask | ((cen.data > 0.01) & (rel.data > 2.3))), fill_value=np.nan) +ele_mod.data[ele_mod.mask] = np.nan +# ele_mod.mask = ele.mask | ((cen.data > 0.01) & (rel.data > 0.01)) + + +def points_removed_interp(): + z = ele_mod + x, y = np.mgrid[0:z.shape[0], 0:z.shape[1]] + x1 = x[~z.mask] + y1 = y[~z.mask] + z1 = z[~z.mask] + print(z1.shape) + return interp2d(x1, y1, z1, kind="linear")(np.arange(z.shape[0]), np.arange(z.shape[1])) + + +def recatangular_interp(): + z = ele_mod.filled(np.nan) + x = np.array(range(ele_mod.shape[0])) + y = np.array(range(ele_mod.shape[1])) + zinterp = RegularGridInterpolator((x, y), z, method="linear") + X2, Y2 = np.meshgrid(x, y) + newpoints = np.array((X2, Y2)).T + + # actual interpolation + z2 = zinterp(newpoints) + z2_masked = np.ma.array(z2, mask=np.isnan(z2)) + return z2_masked + + +profile.update(compress='lzw') + +with rio.open(smooth_dem.joinpath("smooth_elevation_small_masked_ref_2p3.tif"), mode="w", **profile,) as update_dataset: + # out = recatangular_interp() + # out = points_removed_interp() + print(profile) + update_dataset.write(ele_mod, 1) + + +# import IPython; IPython.embed(); import sys; sys.exit() +# flat_ele = interp2d(x1, y1, z1) +# (np.arange(ele_mod.shape[0]), np.arange(ele_mod.shape[1])) + + + diff --git a/setup.py b/setup.py index 
068891c6..b14d4aba 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def run(self): 'revrand >= 0.9.10', 'mpi4py==3.1.3', 'scipy==1.6.2', - 'scikit-learn~=0.22.2', + 'scikit-learn==0.22.2.post1', 'scikit-image==0.19.1', 'scikit-optimize == 0.8.1', 'wheel >= 0.29.0', diff --git a/uncoverml/config.py b/uncoverml/config.py index 9abee74d..f7a67ce4 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -164,15 +164,20 @@ def __init__(self, yaml_file): self.krige = self.algorithm == 'krige' if 'prediction' in s: - self.quantiles = s['prediction']['quantiles'] - self.geotif_options = s['prediction']['geotif'] if 'geotif' in \ - s['prediction'] else {} + prediction = s["prediction"] + self.quantiles = prediction['quantiles'] + self.geotif_options = prediction['geotif'] if 'geotif' in prediction else {} self.outbands = None - if 'outbands' in s['prediction']: - self.outbands = s['prediction']['outbands'] - self.thumbnails = s['prediction']['thumbnails'] \ + if 'outbands' in prediction: + self.outbands = prediction['outbands'] + self.thumbnails = prediction['thumbnails'] \ if 'thumbnails' in s['prediction'] else 10 + self.prediction_template = prediction["prediction_template"] \ + if "prediction_template" in prediction else None + + self.is_prediction = False + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: @@ -292,6 +297,7 @@ def __init__(self, yaml_file): self.split_oos_fraction = s['targets']['split']['oos_fraction'] self.train_shapefile = Path(self.output_dir).joinpath(s['output']['train_shapefile']) self.oos_shapefile = Path(self.output_dir).joinpath(s['output']['oos_shapefile']) + self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.mask = None if 'mask' in s: @@ -303,6 +309,10 @@ def __init__(self, yaml_file): preprocessing_transforms = s['preprocessing']['transforms'] if 'n_components' in preprocessing_transforms[0]['whiten']: self.n_components = preprocessing_transforms[0]['whiten']['n_components'] + elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: + self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] + elif 'variation_fraction' in preprocessing_transforms[0]['whiten']: + self.variation_fraction = preprocessing_transforms[0]['whiten']['variation_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: @@ -388,14 +398,12 @@ def __init__(self, yaml_file): else self.name + ('.cluster' if self.clustering else '.model') self.model_file = Path(self.output_dir).joinpath(output_model) - if self.optimised_model: - self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') - self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') - self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') - self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") - self.optimised_model_file = Path(self.output_dir).joinpath(self.name + "_optimised.model") - self.outfile_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") - self.optimised_model_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") + self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') + self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') + 
self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") + self.optimised_model_file = Path(self.output_dir).joinpath(self.name + "_optimised.model") + self.outfile_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") + self.optimised_model_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") class ConfigException(Exception): diff --git a/uncoverml/features.py b/uncoverml/features.py index bcbe4880..161e3804 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from collections import OrderedDict import numpy as np import pickle @@ -11,14 +12,16 @@ from uncoverml import patch from uncoverml import transforms from uncoverml.config import Config +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) -def extract_subchunks(image_source, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize, + template_source: Optional[RasterioImageSource] = None): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index - image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) + image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize, template_source) x = patch.all_patches(image, patchsize) return x @@ -87,7 +90,9 @@ def transform_features(feature_sets, transform_sets, final_transform, config): if mpiops.chunk_index == 0 and config.pickle: log.info('Saving featurevec for reuse') pickle.dump(feature_vec, open(config.featurevec, 'wb')) - + if mpiops.chunk_index == 0: + for i, f in enumerate(feature_sets[0].keys()): + log.debug(f"Using feature num {i} from: {f}") x = np.ma.concatenate(transformed_vectors, axis=1) if config.cubist or config.multicubist or config.krige: log.warning("{}: Ignoring preprocessing " @@ -170,9 +175,15 @@ def cull_all_null_rows(feature_sets): bool_transformed_vectors = np.concatenate([t.mask for t in transformed_vectors], axis=1) - covaraiates = bool_transformed_vectors.shape[1] - rows_to_keep = np.sum(bool_transformed_vectors, axis=1) != covaraiates - return rows_to_keep + data_transformed_vectors = np.concatenate([t.data for t in + transformed_vectors], axis=1) + num_covariates = bool_transformed_vectors.shape[1] + + rows_with_at_least_one_cov_unmasked = np.sum(bool_transformed_vectors, axis=1) != num_covariates + # good rows are any covariate unmasked and all finite covariates + rows_with_all_finite_covariates = np.isfinite(data_transformed_vectors).sum(axis=1) == num_covariates + good_rows = rows_with_all_finite_covariates & rows_with_at_least_one_cov_unmasked + return good_rows def gather_features(x, node=None): diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 38a9918c..33ee3b4e 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -1,4 +1,5 @@ from __future__ import division +from typing import Optional import joblib import os.path from pathlib import Path @@ -82,7 +83,9 @@ def __init__(self, filename): self.filename = filename assert os.path.isfile(filename), '{} does not exist'.format(filename) + with rasterio.open(self.filename, 'r') as geotiff: + self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes @@ -284,14 +287,17 @@ def load_targets(shapefile, targetfield, conf: Config): return targets -def 
get_image_spec(model, config): +def get_image_spec(model, config: Config): # temp workaround, we should have an image spec to check against nchannels = len(model.get_predict_tags()) return get_image_spec_from_nchannels(nchannels, config) -def get_image_spec_from_nchannels(nchannels, config): - imagelike = config.feature_sets[0].files[0] +def get_image_spec_from_nchannels(nchannels, config: Config): + if config.prediction_template and config.is_prediction: + imagelike = Path(config.prediction_template).absolute() + else: + imagelike = config.feature_sets[0].files[0] template_image = image.Image(RasterioImageSource(imagelike)) eff_shape = template_image.patched_shape(config.patchsize) + (nchannels,) eff_bbox = template_image.patched_bbox(config.patchsize) @@ -425,7 +431,7 @@ def output_thumbnails(self, ratio=10): resample(f, output_tif=thumbnail, ratio=ratio) -def feature_names(config): +def feature_names(config: Config): results = [] for s in config.feature_sets: @@ -438,9 +444,12 @@ def feature_names(config): return results -def _iterate_sources(f, config): +def _iterate_sources(f, config: Config): results = [] + template_tif = config.prediction_template if config.is_prediction else None + if config.is_prediction: + log.info(f"Using prediction template {config.prediction_template}") for s in config.feature_sets: extracted_chunks = {} for tif in s.files: @@ -448,18 +457,7 @@ def _iterate_sources(f, config): name = os.path.abspath(tif) image_source = RasterioImageSource(tif) x = f(image_source) - # TODO this may hurt performance. Consider removal - if type(x) is np.ma.MaskedArray: - count = mpiops.count(x) - # if not np.all(count > 0): - # s = ("{} has no data in at least one band.".format(name) + - # " Valid_pixel_count: {}".format(count)) - # raise ValueError(s) - missing_percent = missing_percentage(x) - t_missing = mpiops.comm.allreduce( - missing_percent) / mpiops.chunks - log.info("{}: {}px {:2.2f}% missing".format( - name, count, t_missing)) + log_missing_percentage(name, x) extracted_chunks[name] = x extracted_chunks = OrderedDict(sorted( extracted_chunks.items(), key=lambda t: t[0])) @@ -468,6 +466,21 @@ def _iterate_sources(f, config): return results +def log_missing_percentage(name, x): + # TODO this may hurt performance. 
Consider removal + if type(x) is np.ma.MaskedArray: + count = mpiops.count(x) + # if not np.all(count > 0): + # s = ("{} has no data in at least one band.".format(name) + + # " Valid_pixel_count: {}".format(count)) + # raise ValueError(s) + missing_percent = missing_percentage(x) + t_missing = mpiops.comm.allreduce( + missing_percent) / mpiops.chunks + log.info("{}: {}px {:2.2f}% missing".format( + name, count, t_missing)) + + def image_resolutions(config): def f(image_source): r = image_source._full_res @@ -477,10 +490,16 @@ def f(image_source): return result -def image_subchunks(subchunk_index, config): +def image_subchunks(subchunk_index, config: Config): + """This is used in prediction only""" - def f(image_source): - r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) + def f(image_source: RasterioImageSource): + if config.is_prediction and config.prediction_template is not None: + template_source = RasterioImageSource(config.prediction_template) + else: + template_source = None + r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize, + template_source=template_source) return r result = _iterate_sources(f, config) return result @@ -538,8 +557,9 @@ def f(image_source): patchsize=config.patchsize) if frac < 1.0: np.random.seed(1) - r = r[np.random.rand(r.shape[0]) < frac] - return r + rr = r[np.random.rand(r.shape[0]) < frac] + log.info(f"sampled {rr.shape[0]} from max possible {r.shape[0]}") + return rr result = _iterate_sources(f, config) return result diff --git a/uncoverml/image.py b/uncoverml/image.py index 395c23c9..0f4494d4 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -1,7 +1,9 @@ +from typing import Optional import numpy as np import logging from affine import Affine +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) @@ -27,7 +29,10 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, + chunk_idx=0, nchunks=1, overlap=0, + t_source: Optional[RasterioImageSource] = None, + ): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: @@ -48,6 +53,10 @@ def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): self.pixsize_x = source.pixsize_x self.pixsize_y = source.pixsize_y self.crs = source.crs + log.debug(f"Image full resolution : {source.full_resolution}") + log.debug(f"Image origin longitude : {source.origin_longitude}") + log.debug(f"Image origin latitude : {source.origin_latitude}") + assert self.pixsize_x > 0 assert self.pixsize_y > 0 @@ -64,21 +73,29 @@ def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): self._pix_y_to_coords = dict(zip(pix_y, coords_y)) # exclusive y range of this chunk in full image - ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] - self._offset = np.array([0, ymin], dtype=int) - # exclusive x range of this chunk (same for all chunks) - xmin, xmax = 0, self._full_res[0] + if t_source: + self._t_full_res = t_source.full_resolution + self._t_start_lon = t_source.origin_longitude + self._t_start_lat = t_source.origin_latitude + self._t_pixsize_x = t_source.pixsize_x + self._t_pixsize_y = t_source.pixsize_y + tymin, tymax = construct_splits(self._t_full_res[1], nchunks, overlap)[chunk_idx] + t_x_offset = np.searchsorted(self._coords_x, self._t_start_lon) + t_y_offset = np.searchsorted(self._coords_y, self._t_start_lat) + xmin, 
xmax = t_x_offset, t_x_offset + self._t_full_res[0] + ymin, ymax = t_y_offset + tymin, t_y_offset + tymax + self.resolution = (xmax-xmin, ymax - ymin, self._t_full_res[2]) + else: + # exclusive x range of this chunk (same for all chunks) + xmin, xmax = 0, self._full_res[0] + ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] + self.resolution = (xmax - xmin, ymax - ymin, self._full_res[2]) + self._offset = np.array([xmin, ymin], dtype=int) assert(xmin < xmax) assert(ymin < ymax) - # get resolution of this chunk - xres = self._full_res[0] - yres = ymax - ymin - # Calculate the new values for resolution and bounding box - self.resolution = (xres, yres, self._full_res[2]) - start_bound_x, start_bound_y = self._global_pix2lonlat( np.array([[xmin, ymin]]))[0] # one past the last pixel diff --git a/uncoverml/mpiops.py b/uncoverml/mpiops.py index ad17814f..5c8e37a4 100644 --- a/uncoverml/mpiops.py +++ b/uncoverml/mpiops.py @@ -146,7 +146,11 @@ def outer(x): out = comm.allreduce(x_outer_local) still_masked = np.ma.count_masked(out) if still_masked != 0: - log.info('Reported out: ' + ', '.join([str(s) for s in out])) + log.info("=========still_masked =================") + if chunk_index == 0: + for s in out: + log.info(s) + # log.info('Reported out: ' + ', '.join([str(s) for s in out])) raise ValueError("Can't compute outer product:" " completely missing columns!") if hasattr(out, 'mask'): @@ -161,6 +165,7 @@ def covariance(x): def eigen_decomposition(x): + log.info(f"Eigenvalue matrix shape in process {chunk_index}: {x.shape}") eigvals, eigvecs = np.linalg.eigh(covariance(x)) return eigvals, eigvecs diff --git a/uncoverml/patch.py b/uncoverml/patch.py index 028cce5d..e7e7f4df 100644 --- a/uncoverml/patch.py +++ b/uncoverml/patch.py @@ -5,6 +5,7 @@ import numpy as np import skimage +from uncoverml.image import Image log = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def point_patches(image, pwidth, points): return output -def _image_to_data(image): +def _image_to_data(image: Image): """ breaks up an image object into arrays suitable for sending to the patching functions @@ -92,7 +93,7 @@ def _image_to_data(image): return data, mask, data_dtype -def all_patches(image, patchsize): +def all_patches(image: Image, patchsize): data, mask, data_dtype = _image_to_data(image) patches = grid_patches(data, patchsize) patch_mask = grid_patches(mask, patchsize) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index df06747a..1cc8d529 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -13,6 +13,7 @@ from uncoverml.krige import krig_dict from uncoverml import transforms from uncoverml.config import Config +from uncoverml.hyopt import NpEncoder log = logging.getLogger(__name__) float32finfo = np.finfo(dtype=np.float32) @@ -128,8 +129,7 @@ def _fix_for_corrupt_data(x, feature_names): x.data[min_mask] = float32finfo.min x.data[max_mask] = float32finfo.max x.mask += min_mask + max_mask - problem_feature_names = \ - list(compress(feature_names, ~isfinite.all(axis=0))) + problem_feature_names = list(compress(feature_names, ~isfinite.all(axis=0))) print("Warning: Float64 data was truncated to float32 max/min values." 
"Check your covariates for possible nodatavalue, datatype, " @@ -144,31 +144,22 @@ def _get_data(subchunk, config): # NOTE: This returns an *untransformed* x, # which is ok as we just need dummies here - if config.mask: - mask_x = _mask(subchunk, config) - all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) - if all_mask_x.shape[0] == np.sum(all_mask_x.mask): - x = np.ma.zeros((mask_x.shape[0], len(features_names)), - dtype=np.bool) - x.mask = True - log.info('Partition {} covariates are not loaded as ' - 'the partition is entirely masked.'.format(subchunk + 1)) - return x, features_names + # if config.mask: + # mask_x = _mask(subchunk, config) + # all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) + # if all_mask_x.shape[0] == np.sum(all_mask_x.mask): + # x = np.ma.zeros((mask_x.shape[0], len(features_names)), + # dtype=np.bool) + # x.mask = True + # log.info('Partition {} covariates are not loaded as ' + # 'the partition is entirely masked.'.format(subchunk + 1)) + # return x, features_names transform_sets = [k.transform_set for k in config.feature_sets] extracted_chunk_sets = geoio.image_subchunks(subchunk, config) log.info("Applying feature transforms") x = features.transform_features(extracted_chunk_sets, transform_sets, config.final_transform, config)[0] - # only check/correct float32 conversion for Ensemble models - if config.pca: - x = _fix_for_corrupt_data(x, features_names) - return _mask_rows(x, subchunk, config), features_names - - if not config.clustering: - if (isinstance(modelmaps[config.algorithm](), BaseEnsemble) or - config.multirandomforest): - x = _fix_for_corrupt_data(x, features_names) return _mask_rows(x, subchunk, config), features_names @@ -213,12 +204,12 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf log.info("Loaded {:2.4f}GB of image data".format(total_gb)) alg = config.algorithm log.info("Predicting targets for {}.".format(alg)) - if hasattr(config, 'quantiles'): - y_star = predict(x, model, interval=config.quantiles, - lon_lat=_get_lon_lat(subchunk, config)) - else: - y_star = predict(x, model, lon_lat=_get_lon_lat(subchunk, config)) - + try: + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) + except ValueError as v: + log.warning(v) + x = _fix_for_corrupt_data(x, feature_names) + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) # cluster_analysis(x, y_star, subchunk, config, feature_names) @@ -227,17 +218,20 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): x, _ = _get_data(subchunk, config) - mpiops.run_once(export_pca_fractions, config) total_gb = mpiops.comm.allreduce(x.nbytes / 1e9) log.info("Loaded {:2.4f}GB of image data".format(total_gb)) log.info("Extracting PCAs....") - image_out.write(np.flip(x, axis=1), subchunk) + # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue + # is the first PC. 
+    x = apply_masked(lambda _x: _x, np.flip(x, axis=1))
+    image_out.write(x, subchunk)


-def export_pca_fractions(config):
+def export_pca_fractions(config: Config):
     whiten_transform = config.final_transform.global_transforms[0]
     with open(config.pca_json, 'w') as output_file:
-        json.dump(whiten_transform.explained_ratio, output_file, sort_keys=True, indent=4)
+        json.dump(whiten_transform.__dict__, output_file, sort_keys=True, indent=4, cls=NpEncoder)
+    log.info(f"Wrote PC contributions/whiten transform stats to {config.pca_json}")


 def cluster_analysis(x, y, partition_no, config, feature_names):
diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py
index ebea5f57..598e7b43 100644
--- a/uncoverml/scripts/uncoverml.py
+++ b/uncoverml/scripts/uncoverml.py
@@ -14,6 +14,10 @@ from pathlib import Path

 import warnings

+warnings.filterwarnings(action='ignore', category=FutureWarning)
+warnings.filterwarnings(action='ignore', category=DeprecationWarning)
+warnings.filterwarnings(action='ignore', category=Warning)
+
 import click
 import numpy as np
 import matplotlib
@@ -39,9 +43,6 @@ log = logging.getLogger(__name__)

 # warnings.showwarning = warn_with_traceback

-warnings.filterwarnings(action='ignore', category=FutureWarning)
-warnings.filterwarnings(action='ignore', category=DeprecationWarning)
-warnings.filterwarnings(action='ignore', category=Warning)


 @click.group()
@@ -339,16 +340,20 @@ def validate(pipeline_file, model_or_cluster_file, partitions):
               help='mask file used to limit prediction area')
 @click.option('-r', '--retain', type=int, default=None,
               help='mask values where to predict')
-def predict(model_or_cluster_file, partitions, mask, retain):
+@click.option('-t', '--prediction_template', type=click.Path(exists=True), default=None,
+              help='optional geotiff to use as the prediction template')
+def predict(model_or_cluster_file, partitions, mask, retain, prediction_template):
     with open(model_or_cluster_file, 'rb') as f:
         state_dict = joblib.load(f)

     model = state_dict["model"]
     config = state_dict["config"]
+    config.is_prediction = True
     config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \
         else False
     config.mask = mask if mask else config.mask
+    config.prediction_template = prediction_template if prediction_template else config.prediction_template
     if config.mask:
         config.retain = retain if retain else config.retain
@@ -414,14 +419,20 @@ def predict(model_or_cluster_file, partitions, mask, retain):
 @click.argument('pipeline_file')
 @click.option('-p', '--partitions', type=int, default=1,
               help='divide each node\'s data into this many partitions')
+@click.option('-s', '--subsample_fraction', type=float, default=1.0,
+              help='only use this fraction of the data to compute the whiten (PCA) statistics')
 @click.option('-m', '--mask', type=str, default='',
               help='mask file used to limit prediction area')
 @click.option('-r', '--retain', type=int, default=None,
               help='mask values where to predict')
-def pca(pipeline_file, partitions, mask, retain):
+def pca(pipeline_file, partitions, subsample_fraction, mask, retain):
     config = ls.config.Config(pipeline_file)
+
+    __validate_pca_config(config)  # no other transforms other than whiten
     assert config.pca, "Not a pca analysis. Please include the pca block in your yaml!!"
     config.mask = mask if mask else config.mask
+
+    config.subsample_fraction = subsample_fraction
     if config.mask:
         config.retain = retain if retain else config.retain
@@ -432,24 +443,37 @@ def pca(pipeline_file, partitions, mask, retain):
     config.n_subchunks = partitions
     if config.n_subchunks > 1:
-        log.info("Memory contstraint forcing {} iterations "
-                 "through data".format(config.n_subchunks))
+        log.info("Memory constraint forcing {} iterations through data".format(config.n_subchunks))
     else:
         log.info("Using memory aggressively: dividing all data between nodes")

-    image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(config.n_components, config)
-    tr_whiten = __validate_pca_config(config)
+    # Get the image chunks and their associated transforms
+    image_chunk_sets = ls.geoio.unsupervised_feature_sets(config)
+    transform_sets = [k.transform_set for k in config.feature_sets]
+
+    features, _ = ls.features.transform_features(image_chunk_sets,
+                                                 transform_sets,
+                                                 config.final_transform,
+                                                 config)
+    features, _ = ls.features.remove_missing(features)
+    num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0]))
+    log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of"
+             f" {num_samples} points from the rasters")
+    log.info(f"Done whiten transform with {subsample_fraction*100}% of all data")
+    ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config)
+    ls.mpiops.run_once(ls.predict.export_pca_fractions, config)
+
+    whiten_transform = config.final_transform.global_transforms[0]
+    image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(whiten_transform.keepdims, config)

     outfile_tif = config.name + "_pca"
     image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs,
                                      outfile_tif, config.n_subchunks,
                                      config.output_dir,
-                                     band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)],
+                                     band_tags=[f'pc_{n}' for n in range(1, whiten_transform.keepdims+1)],
                                      **config.geotif_options)
-
     for i in range(config.n_subchunks):
         log.info("starting to render partition {}".format(i+1))
-        # TODO: ideally want to take a random sample of each covariate and compute whiten stats
         ls.predict.export_pca(i, image_out, config)

     # explicitly close output rasters
@@ -459,6 +483,7 @@
 def __validate_pca_config(config):
+    # assert no other transforms other than whiten
     transform_sets = [k.transform_set for k in config.feature_sets]
     for t in transform_sets:
         assert len(t.image_transforms) == 0  # validation that there are no image or global transforms
diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py
index 2fc0e7b8..0017da2f 100644
--- a/uncoverml/transforms/linear.py
+++ b/uncoverml/transforms/linear.py
@@ -1,7 +1,8 @@
-
+import logging
 import numpy as np

 from uncoverml import mpiops

+log = logging.getLogger(__name__)

 class CentreTransform:
@@ -79,36 +80,43 @@ def __call__(self, *args):


 class WhitenTransform:
-    def __init__(self, keep_fraction=None, n_components=None):
+    def __init__(self, keep_fraction=None, n_components=None, variation_fraction=None):
         self.mean = None
         self.eigvals = None
         self.eigvecs = None
         self.keep_fraction = keep_fraction
         self.n_components = n_components
-        if n_components is None:
-            assert keep_fraction is not None
-        if keep_fraction is None:
-            assert n_components is not None
-        assert (keep_fraction is None) or (n_components is None)
-        self.explained_ratio = None
+        self.variation_fraction = variation_fraction
+        null_check = [x is None for x in [keep_fraction, n_components, variation_fraction]]
+        assert np.sum(list(null_check)) == 2, "only one of keep_fraction, n_components, " \
+                                              "variation_fraction can be non null"
+        self.explained_ratio = {}

     def __call__(self, x):
         x = x.astype(float)
         if self.mean is None or self.eigvals is None or self.eigvecs is None:
             self.mean = mpiops.mean(x)
+            num_points = np.sum(mpiops.comm.gather(x.shape[0]))
+            num_covariates = x.shape[1]
+            log.info(f"Matrix for eigenvalue decomposition has shape {num_points, num_covariates}")
             self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x)
             ndims = x.shape[1]
             # make sure 1 <= keepdims <= ndims
-            if self.n_components is None:
+            self.explained_ratio = {
+                ndims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-ndims:])/np.sum(np.abs(
+                    self.eigvals)))
+            }
+
+            if self.n_components is None and self.variation_fraction is None:
                 self.keepdims = min(max(1, int(ndims * self.keep_fraction)), ndims)
+            elif self.n_components is None and self.keep_fraction is None:
+                explained_ratio_cum_sum = np.cumsum(list(self.explained_ratio.values())[::-1])
+                self.keepdims = np.searchsorted(explained_ratio_cum_sum > self.variation_fraction*100, True) + 1
+                log.info(f"Variation percentage {self.variation_fraction*100} requires only {self.keepdims} PCs")
             else:
                 self.keepdims = self.n_components
                 assert self.n_components < ndims, "More components demanded than features. Not possible! \n " \
                                                   "Please reduce n_components or increase the number of features"
-            self.explained_ratio = {
-                self.keepdims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-self.keepdims:])/np.sum(np.abs(
-                    self.eigvals)))
-            }
         mat = self.eigvecs[:, - self.keepdims:]
         vec = self.eigvals[np.newaxis, - self.keepdims:]
diff --git a/uncoverml/validate.py b/uncoverml/validate.py
index 54b82a86..0e42a946 100644
--- a/uncoverml/validate.py
+++ b/uncoverml/validate.py
@@ -374,14 +374,16 @@ def setup_validation_data(X, targets_all, cv_folds, random_state=None):
                                            random_state=random_state)

     rows_with_at_least_one_masked = ~ np.any(X.mask, axis=1)
-    X = X[rows_with_at_least_one_masked, :]
-    w = w[rows_with_at_least_one_masked]
-    y = y[rows_with_at_least_one_masked]
-    lon_lat = lon_lat[rows_with_at_least_one_masked, :]
-    groups = groups[rows_with_at_least_one_masked]
+    finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1]
+    valid_rows = rows_with_at_least_one_masked & finite_X
+    X = X[valid_rows, :]
+    w = w[valid_rows]
+    y = y[valid_rows]
+    lon_lat = lon_lat[valid_rows, :]
+    groups = groups[valid_rows]

     for (f, v), a in zip(targets_all.fields.items(), arrays):
-        targets_all.fields[f] = a[rows_with_at_least_one_masked]
+        targets_all.fields[f] = a[valid_rows]

     if len(np.unique(groups)) >= cv_folds:
         log.info(f'Using GroupKFold with {cv_folds} folds')
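Note on the export_pca_fractions change above: dumping whiten_transform.__dict__ requires a JSON encoder that understands numpy types, which is why cls=NpEncoder is passed. The real uncoverml.hyopt.NpEncoder is not shown in this patch; the sketch below is a typical encoder of that kind (NumpyJSONEncoder and the sample stats dict are hypothetical, for illustration only).

import json
import numpy as np

class NumpyJSONEncoder(json.JSONEncoder):
    """Serialise numpy scalars and arrays that json.dump cannot handle natively."""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# e.g. the kind of whiten-transform stats written to pca_json
stats = {"keepdims": np.int64(4), "eigvals": np.array([0.5, 0.3, 0.15, 0.04, 0.01])}
print(json.dumps(stats, sort_keys=True, indent=4, cls=NumpyJSONEncoder))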
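For reference, the variation_fraction branch added to WhitenTransform.__call__ amounts to a cumulative explained-variance cut-off on the eigenvalues. A minimal standalone sketch of that selection rule (not part of the patch; choose_keepdims and the sample eigenvalues are made up, and it works with fractions rather than the percentages stored in explained_ratio):

import numpy as np

def choose_keepdims(eigvals, variation_fraction):
    """Smallest number of PCs whose eigenvalues explain at least
    variation_fraction of the total variance.

    np.linalg.eigh returns eigenvalues in ascending order, so the
    leading PCs sit at the end of the array.
    """
    ratios = np.abs(eigvals) / np.sum(np.abs(eigvals))  # explained-variance ratios
    cumulative = np.cumsum(ratios[::-1])                # largest eigenvalue first
    # index of the first cumulative value exceeding the requested fraction
    keepdims = int(np.searchsorted(cumulative > variation_fraction, True)) + 1
    return min(keepdims, len(eigvals))

# eigenvalues as they might come out of mpiops.eigen_decomposition for 5 covariates
eigvals = np.array([0.01, 0.04, 0.15, 0.30, 0.50])
print(choose_keepdims(eigvals, variation_fraction=0.98))  # -> 4 PCs (cumulative 0.99)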
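Similarly, the setup_validation_data change keeps a row only when it has no masked covariate and every raw value is finite, and the same boolean index is then applied to every per-target array. A small self-contained illustration of that filter (not part of the patch; the toy arrays are invented):

import numpy as np

X = np.ma.masked_invalid(np.array([[1.0, 2.0],
                                   [np.nan, 3.0],
                                   [4.0, np.inf],
                                   [5.0, 6.0]]))
y = np.array([10.0, 20.0, 30.0, 40.0])

no_masked_cells = ~np.any(X.mask, axis=1)                  # rows with no masked covariate
finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1]   # rows with no nan/inf in the raw data
valid_rows = no_masked_cells & finite_X

X, y = X[valid_rows, :], y[valid_rows]
print(X.shape, y)  # (2, 2) [10. 40.]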