
Commit

Merge remote-tracking branch 'origin/data-wrangling' into sudipta-merge
# Conflicts:
#	uncoverml/config.py
#	uncoverml/predict.py
basaks committed May 18, 2023
2 parents df3f699 + 4594183 commit bc440b6
Showing 26 changed files with 1,005 additions and 287 deletions.
1 change: 0 additions & 1 deletion configs/demo_regression.yaml
@@ -33,7 +33,6 @@ mask:
# imputation options are gaus, nn, mean
preprocessing:
imputation: mean

transforms:
# - whiten:
# keep_fraction: 0.8
Empty file modified configs/group_and_split_shapefile.yaml
100755 → 100644
Empty file.
53 changes: 35 additions & 18 deletions configs/ref_rf.yaml
@@ -14,19 +14,19 @@ features:
- path: configs/data/sirsam/k_15v5.tif
- path: configs/data/sirsam/relief_apsect.tif
transforms:
- standardise
- whiten:
keep_fraction: 0.8
# - standardise
# - whiten:
# keep_fraction: 0.8
imputation: none

preprocessing:
imputation: none
transforms:
- whiten:
keep_fraction: 0.8
# - whiten:
# keep_fraction: 0.8

targets:
file: configs/data/geochem_sites.shp
file: configs/data/geochem_sites_cropped.shp
property: K_ppm_imp
# group_targets:
# groups_eps: 0.09
@@ -39,22 +39,39 @@ learning:
random_state: 1
max_depth: 20
optimisation:
searchcv_params:
n_iter: 6
hyperopt_params:
max_evals: 5
step: 2
cv: 2
verbose: 1000
n_points: 3
n_jobs: 6
params_space:
'max_depth': Integer(1, 15)
'n_estimators': Integer(10, 100)
'max_features': Categorical(['auto', 'sqrt', 'log2'])
'min_samples_split': Integer(2, 50)
'min_samples_leaf': Integer(1, 50)
'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')
verbose: true
random_state: 3
scoring: r2 # r2, neg_mean_absolute_error, etc..see note above
algo: bayes # bayes, or anneal
hp_params_space:
max_depth: randint('max_depth', 1, 15)
n_estimators: randint('n_estimators', 5, 25)
max_features: choice('max_features', ['auto', 'sqrt', 'log2'])
min_samples_split: randint('min_samples_split', 2, 50)
min_samples_leaf: randint('min_samples_leaf', 1, 50)
min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5)
max_leaf_nodes: randint('max_leaf_nodes', 10, 50)
# searchcv_params:
# n_iter: 6
# cv: 2
# verbose: 1000
# n_points: 3
# n_jobs: 6
# params_space:
# 'max_depth': Integer(1, 15)
# 'n_estimators': Integer(10, 100)
# 'max_features': Categorical(['auto', 'sqrt', 'log2'])
# 'min_samples_split': Integer(2, 50)
# 'min_samples_leaf': Integer(1, 50)
# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')


prediction:
prediction_template: configs/data/sirsam/dem_foc2.tif
quantiles: 0.95
outbands: 4

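
The hp_params_space entries above are strings naming hyperopt distributions. A minimal sketch of the equivalent search space written directly against hyperopt (an illustration only; the parser uncoverml uses to evaluate these strings is not shown in this diff, and hp.randint with low/high bounds assumes hyperopt >= 0.2.5):

from hyperopt import hp

space = {
    "max_depth": hp.randint("max_depth", 1, 15),
    "n_estimators": hp.randint("n_estimators", 5, 25),
    "max_features": hp.choice("max_features", ["auto", "sqrt", "log2"]),
    "min_samples_split": hp.randint("min_samples_split", 2, 50),
    "min_samples_leaf": hp.randint("min_samples_leaf", 1, 50),
    "min_weight_fraction_leaf": hp.uniform("min_weight_fraction_leaf", 0.0, 0.5),
    "max_leaf_nodes": hp.randint("max_leaf_nodes", 10, 50),
}
# 'algo: bayes' and 'algo: anneal' presumably select hyperopt's TPE and anneal
# suggest functions, run for max_evals trials with cross-validated scoring.
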
42 changes: 13 additions & 29 deletions configs/reference_gb.yaml
@@ -58,17 +58,17 @@ learning:
algorithm: gradientboost
arguments:
target_transform: identity
loss: 'ls'
max_depth: 20
learning_rate: 0.1
n_estimators: 100
subsample: 0.9
min_samples_split: 2
min_samples_leaf: 2
min_weight_fraction_leaf: 0.0
max_features: "auto"
alpha: 0.95
random_state: 3
# loss: 'ls'
# max_depth: 20
# learning_rate: 0.1
n_estimators: 200
# subsample: 0.9
# min_samples_split: 2
# min_samples_leaf: 2
# min_weight_fraction_leaf: 0.0
# max_features: "auto"
# alpha: 0.95
# random_state: 3
optimisation:
hyperopt_params:
max_evals: 5
@@ -89,22 +89,6 @@ learning:
min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5)
max_leaf_nodes: randint('max_leaf_nodes', 10, 50)

searchcv_params:
n_iter: 6
cv: 2
verbose: 1000
n_points: 3
n_jobs: 6
params_space:
'max_depth': Integer(1, 15)
'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform")
'n_estimators': Integer(10, 100)
'subsample': Real(0.01, 1.0, prior='uniform')
'max_features': Categorical(['auto', 'sqrt', 'log2'])
'min_samples_split': Integer(2, 50)
'min_samples_leaf': Integer(1, 50)
'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform')
'max_leaf_nodes': Integer(10, 50)

prediction:
# corner_coordinates:
@@ -114,10 +98,10 @@ prediction:
outbands: 1

validation:
#- feature_rank
- feature_rank
- parallel
- k-fold:
folds: 3
folds: 5
random_seed: 1

output:
12 changes: 10 additions & 2 deletions configs/reference_pca.yaml
@@ -4,7 +4,7 @@ memory_fraction: 0.5

features:
- name: my continuous features
type: continuous
type: ordinal
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
@@ -14,11 +14,19 @@ features:
- path: configs/data/sirsam/relief_apsect.tif
- path: configs/data/sirsam/LATITUDE_GRID1.tif
- path: configs/data/sirsam/LONGITUDE_GRID1.tif
imputation: nn
# nodes: 5000

preprocessing:
transforms:
- whiten:
n_components: 4
# n_components: 2
# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed
variation_fraction: 0.98

mask:
file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif
retain: 1

pca:
geotif:
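
For context on the variation_fraction option above, a minimal scikit-learn sketch (an illustrative assumption, not uncoverml's own whiten transform) of PCA whitening that keeps just enough components to explain 98% of the variance:

import numpy as np
from sklearn.decomposition import PCA

# Stand-in covariate matrix; in practice this would be the stacked, imputed feature columns.
X = np.random.default_rng(0).normal(size=(500, 8))
# A fractional n_components keeps the smallest number of components explaining that share of variance.
pca = PCA(n_components=0.98, whiten=True, svd_solver="full")
X_white = pca.fit_transform(X)
print(pca.n_components_, X_white.shape)
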
28 changes: 15 additions & 13 deletions configs/reference_xgboost.yaml
@@ -1,26 +1,29 @@

experiment: my_run
patchsize: 0
memory_fraction: 0.5

features:
- name: my continuous features
- name: my continuous features 2
type: continuous
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
- path: configs/data/sirsam/dem_foc2.tif
- path: configs/data/sirsam/outcrop_dis2.tif
# - path: configs/data/sirsam/k_15v5.tif
- path: configs/data/sirsam/relief_apsect.tif
# - path: configs/data/sirsam/LATITUDE_GRID1.tif
# - path: configs/data/sirsam/LONGITUDE_GRID1.tif
# - directory: configs/data/weights/
transforms:
- identity
# - whiten:
# keep_fraction: 0.98
imputation: none
imputation: mean
- name: my continuous features 1
type: continuous
files:
- path: configs/data/sirsam/er_depg.tif
- path: configs/data/sirsam/sagawet_b_sir.tif
transforms:
- identity
# - whiten:
# keep_fraction: 0.98
imputation: gauss

preprocessing:
imputation: none
@@ -125,7 +128,7 @@ prediction:
outbands: 1

validation:
#- feature_rank
# - feature_rank
- parallel
- shapley
# - permutation_importance
@@ -135,9 +138,8 @@ validation:
random_seed: 1

oos_validation:
file: configs/data/weights/Gamma_K_50.shp
property: K2O
file: configs/data/geochem_sites.shp
property: K_ppm_imp

output:
directory: ref_xgb/

4 changes: 2 additions & 2 deletions configs/resampling.yaml
100755 → 100644
@@ -2,8 +2,8 @@
# when undersample == true, we choose points with or without bootstrap

targets:
file: configs/data/weights/Gamma_K_50.shp
property: K2O
file: configs/data/sirsam/out_resampled/geochem_sites.shp
property: K_ppm_imp
resample:
spatial:
undersample: false
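
A toy pandas illustration (assumed semantics of the bootstrap comment above, not uncoverml's resampling code) of under-sampling target points with or without bootstrap:

import pandas as pd

targets = pd.DataFrame({"K_ppm_imp": [120.0, 340.0, 90.0, 210.0, 55.0]})
bootstrap = True  # when bootstrap is enabled, points are drawn with replacement
sampled = targets.sample(n=3, replace=bootstrap, random_state=1)
print(sampled)
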
61 changes: 61 additions & 0 deletions scripts/check_covariates.py
@@ -0,0 +1,61 @@
import subprocess
from joblib import delayed, Parallel
from pathlib import Path
import rasterio as rio
from rasterio.io import DatasetReader
import numpy as np
import pandas as pd

dir = "configs/data/sirsam"

mask = Path(dir).joinpath("dem_foc2.tif")

with rio.open(mask) as geotif:
mask_raster = geotif.read(1, masked=True)



def _parallel_read(r: Path):
try:
with rio.open(r) as geotif:
raster: DatasetReader = geotif.read(1, masked=True)
m = geotif.meta
m['crs'] = m['crs'].to_string()
t = m.pop('transform')
m['pixsize_x'] = t[0]
m['top_left_x'] = t[2]
m['pixsize_y'] = -t[4]
m['top_left_y'] = t[5]
raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas
# print(raster)
m['all_finite'] = np.all(np.isfinite(raster))
m['any_nan'] = np.any(np.isnan(raster))
m['any_large'] = np.any(np.abs(raster) > 1e10)
m['min'] = np.ma.min(raster)
m['mean'] = np.ma.mean(raster)
m['median'] = np.ma.median(raster)
m['max'] = np.ma.max(raster)
m['std'] = np.ma.std(raster)
m['skew'] = 3 * (m['mean'] - m['median']) / m['std']
# subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True)
# raster_attrs[r.stem] = m
return m
except Exception as e:
print(r)
print(e)
return [None] * 14


rets = Parallel(
n_jobs=1,
verbose=100,
)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif"))

import pickle
with open("rets.pk", 'wb') as f:
pickle.dump(rets, f)

raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)}

df = pd.DataFrame.from_dict(raster_attrs)
df.to_csv("quality.csv")
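
The skew column above is Pearson's second skewness coefficient, 3 * (mean - median) / std. A small follow-up sketch (an assumed workflow, not part of this commit) for inspecting the quality.csv it writes, where each column is a covariate and each row one of the collected stats:

import pandas as pd

df = pd.read_csv("quality.csv", index_col=0)
report = df.T  # one row per covariate raster
print(report[["any_nan", "any_large", "min", "max", "std", "skew"]])
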
70 changes: 70 additions & 0 deletions scripts/dedupe_shape.py
@@ -0,0 +1,70 @@
from pathlib import Path
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import rasterio
import geopandas as gpd


def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path):
"""
:param shp: input shapefile with dense points
:param tif: sample tif to read resolution details
:param deduped_shp: output shapefile with one point per down-sampled raster resolution
:return:
"""
print("====================================\n", f"deduping {shp.as_posix()}")
geom_cols = ['POINT_X', 'POINT_Y']
pts = gpd.read_file(shp)
for g in geom_cols:
if g in pts.columns:
pts = pts.drop(g, axis=1)
coords = np.array([(p.x, p.y) for p in pts.geometry])
geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index)
pts = pts.merge(geom, left_index=True, right_index=True)

with rasterio.open(tif) as src:
# resample data to target shape
data = src.read(
out_shape=(
src.count,
int(src.height / downscale_factor),
int(src.width / downscale_factor)
),
resampling=rasterio.enums.Resampling.bilinear
)
# scale image transform
transform = src.transform * src.transform.scale(
(src.width / data.shape[-1]),
(src.height / data.shape[-2])
)
pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1])

pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count'))
pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean()
pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols'])

pts_deduped = gpd.GeoDataFrame(pts_deduped,
geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']),
crs="EPSG:3577" # Australian Albers
)
pts_deduped.to_file(deduped_shp.as_posix())
return pts_deduped


if __name__ == '__main__':
shapefiles = Path("configs/data/")
downscale_factor = 6 # keep 1 point in a 6x6 cell

dem = Path('/home/my_dem.tif')
output_dir = Path('1in6')
output_dir.mkdir(exist_ok=True, parents=True)

# for s in shapefiles.glob("*.shp"):
# deduped_shp = output_dir.joinpath(s.name)
# dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp)

Parallel(
n_jobs=-1,
verbose=100,
)(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp"))
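
A toy illustration (made-up points, not part of the commit) of the dedupe step in dedupe_raster: points that land in the same downsampled (row, col) cell are averaged into a single record, with pixel_count recording how many were merged.

import pandas as pd

pts = pd.DataFrame({
    "rows": [0, 0, 1],
    "cols": [0, 0, 2],
    "K_ppm_imp": [100.0, 300.0, 50.0],
    "POINT_X": [10.0, 11.0, 40.0],
    "POINT_Y": [5.0, 6.0, 20.0],
})
counts = pts.groupby(["rows", "cols"], as_index=False).agg(pixel_count=("rows", "count"))
means = pts.groupby(["rows", "cols"], as_index=False).mean()
print(means.merge(counts, how="inner", on=["rows", "cols"]))
# cell (0, 0) collapses to one point with K_ppm_imp == 200.0 and pixel_count == 2
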
