-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/data-wrangling' into sudipta-merge
# Conflicts:
#	uncoverml/config.py
#	uncoverml/predict.py
- Loading branch information
Showing
26 changed files
with
1,005 additions
and
287 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import subprocess | ||
from joblib import delayed, Parallel | ||
from pathlib import Path | ||
import rasterio as rio | ||
from rasterio.io import DatasetReader | ||
import numpy as np | ||
import pandas as pd | ||
|
||
# Root directory holding the covariate GeoTIFFs to audit.
# NOTE(review): `dir` shadows the `dir` builtin; the driver code below also
# uses this name, so it is only flagged here, not renamed.
dir = "configs/data/sirsam"

# Reference DEM whose nodata mask is applied to every covariate.
mask = Path(dir).joinpath("dem_foc2.tif")

# Read band 1 as a masked array so nodata cells carry a mask we can reuse.
with rio.open(mask) as geotif:
    mask_raster = geotif.read(1, masked=True)
|
||
|
||
|
||
def _parallel_read(r: Path):
    """Read band 1 of GeoTIFF *r* and return its metadata plus summary stats.

    The module-level ``mask_raster`` mask is OR-ed onto the raster's own
    mask so statistics ignore areas masked in the reference DEM.

    :param r: path to a covariate GeoTIFF
    :return: dict of flattened rasterio meta fields and statistics on
        success; a list of ``None`` placeholders on failure (see NOTE in the
        except branch).
    """
    try:
        with rio.open(r) as geotif:
            # .read(masked=True) yields a masked ndarray, not a DatasetReader.
            raster: np.ma.MaskedArray = geotif.read(1, masked=True)
            m = geotif.meta
            # Flatten non-serialisable meta entries into plain scalars.
            m['crs'] = m['crs'].to_string()
            t = m.pop('transform')
            m['pixsize_x'] = t[0]
            m['top_left_x'] = t[2]
            m['pixsize_y'] = -t[4]  # row step is negative in the affine transform
            m['top_left_y'] = t[5]
            raster.mask = raster.mask | mask_raster.mask  # we are not interested in masked areas
            # print(raster)
            # Data-quality flags over the unmasked cells.
            m['all_finite'] = np.all(np.isfinite(raster))
            m['any_nan'] = np.any(np.isnan(raster))
            m['any_large'] = np.any(np.abs(raster) > 1e10)
            # np.ma reductions honour the combined mask set above.
            m['min'] = np.ma.min(raster)
            m['mean'] = np.ma.mean(raster)
            m['median'] = np.ma.median(raster)
            m['max'] = np.ma.max(raster)
            m['std'] = np.ma.std(raster)
            # Pearson's second skewness coefficient: 3 * (mean - median) / std.
            m['skew'] = 3 * (m['mean'] - m['median']) / m['std']
            # subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True)
            # raster_attrs[r.stem] = m
            return m
    except Exception as e:
        # Best-effort: report the failing raster and keep the batch going.
        print(r)
        print(e)
        # NOTE(review): this sentinel is a list while the success path
        # returns a dict, and 14 does not match the number of keys built
        # above — a failed raster will misalign the downstream DataFrame.
        # Confirm intent before relying on the failure path.
        return [None] * 14
|
||
|
||
# Glob ONCE and materialise the list so the worker results and the filename
# keys below are guaranteed to line up. The original code called
# Path(dir).glob("**/*.tif") twice; if the directory changed between the two
# scans (or scan order differed), stats were attributed to the wrong raster.
tifs = list(Path(dir).glob("**/*.tif"))

# Fan the per-raster reads out with joblib (n_jobs=1 keeps it sequential but
# preserves the progress/verbose reporting).
rets = Parallel(
    n_jobs=1,
    verbose=100,
)(delayed(_parallel_read)(r) for r in tifs)

# Snapshot the raw results so a failure in the DataFrame step below does not
# cost the (slow) raster scan.
import pickle
with open("rets.pk", 'wb') as f:
    pickle.dump(rets, f)

# Keyed by file stem; order matches `rets` because both come from `tifs`.
raster_attrs = {r.stem: v for r, v in zip(tifs, rets)}

# One column per raster, one row per metadata/statistic field.
df = pd.DataFrame.from_dict(raster_attrs)
df.to_csv("quality.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from pathlib import Path | ||
from joblib import Parallel, delayed | ||
import numpy as np | ||
import pandas as pd | ||
import rasterio | ||
import geopandas as gpd | ||
|
||
|
||
def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path, downscale_factor: int = 6):
    """Thin a dense point shapefile to one (averaged) point per coarsened raster cell.

    Points are binned into the cells of *tif* after its grid is coarsened by
    *downscale_factor*; each bin collapses to the mean of its points, annotated
    with the original point count in a ``pixel_count`` column.

    :param shp: input shapefile with dense points
    :param tif: sample tif to read resolution details
    :param deduped_shp: output shapefile with one point per down-sampled raster resolution
    :param downscale_factor: linear coarsening factor; 6 keeps roughly one
        point per 6x6 block of original cells. Previously this was read from
        a module global defined only under ``__main__``, so importing this
        module and calling the function raised NameError — now a parameter
        with the same default.
    :return: the deduplicated GeoDataFrame that was also written to *deduped_shp*
    """
    print("====================================\n", f"deduping {shp.as_posix()}")
    geom_cols = ['POINT_X', 'POINT_Y']
    pts = gpd.read_file(shp)
    # Drop any stale coordinate columns so they can be rebuilt from geometry.
    for g in geom_cols:
        if g in pts.columns:
            pts = pts.drop(g, axis=1)
    coords = np.array([(p.x, p.y) for p in pts.geometry])
    geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index)
    pts = pts.merge(geom, left_index=True, right_index=True)

    with rasterio.open(tif) as src:
        # resample data to target (coarsened) shape
        data = src.read(
            out_shape=(
                src.count,
                int(src.height / downscale_factor),
                int(src.width / downscale_factor)
            ),
            resampling=rasterio.enums.Resampling.bilinear
        )
        # scale image transform to match the coarsened grid
        transform = src.transform * src.transform.scale(
            (src.width / data.shape[-1]),
            (src.height / data.shape[-2])
        )
    # Coarse-grid cell indices for every point.
    pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1])

    pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count'))
    # NOTE(review): GroupBy.mean() silently drops non-numeric columns on
    # older pandas and may raise on newer versions — confirm the pinned
    # pandas version before upgrading.
    pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean()
    pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols'])

    # Rebuild geometry from the averaged coordinates.
    pts_deduped = gpd.GeoDataFrame(pts_deduped,
                                   geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']),
                                   crs="EPSG:3577"  # Australian Albers
                                   )
    pts_deduped.to_file(deduped_shp.as_posix())
    return pts_deduped
|
||
|
||
if __name__ == '__main__':
    # Inputs: dense geochem point shapefiles plus a reference DEM for the grid.
    shapefiles = Path("configs/data/")
    downscale_factor = 6  # keep 1 point in a 6x6 cell

    dem = Path('/home/my_dem.tif')
    output_dir = Path('1in6')
    output_dir.mkdir(exist_ok=True, parents=True)

    # Sequential fallback, kept for debugging:
    # for s in shapefiles.glob("*.shp"):
    #     deduped_shp = output_dir.joinpath(s.name)
    #     dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp)

    # Fan the matching shapefiles out across all cores.
    jobs = (
        delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name))
        for s in shapefiles.glob("geochem_sites.shp")
    )
    Parallel(n_jobs=-1, verbose=100)(jobs)
Oops, something went wrong.