diff --git a/configs/demo_regression.yaml b/configs/demo_regression.yaml index eda84ce0..c120e7aa 100644 --- a/configs/demo_regression.yaml +++ b/configs/demo_regression.yaml @@ -33,7 +33,6 @@ mask: # imputation options are gaus, nn, mean preprocessing: imputation: mean - transforms: # - whiten: # keep_fraction: 0.8 diff --git a/configs/group_and_split_shapefile.yaml b/configs/group_and_split_shapefile.yaml old mode 100755 new mode 100644 diff --git a/configs/ref_rf.yaml b/configs/ref_rf.yaml index abdb678f..bdfbf1c0 100644 --- a/configs/ref_rf.yaml +++ b/configs/ref_rf.yaml @@ -14,19 +14,19 @@ features: - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif transforms: - - standardise - - whiten: - keep_fraction: 0.8 +# - standardise +# - whiten: +# keep_fraction: 0.8 imputation: none preprocessing: imputation: none transforms: - - whiten: - keep_fraction: 0.8 +# - whiten: +# keep_fraction: 0.8 targets: - file: configs/data/geochem_sites.shp + file: configs/data/geochem_sites_cropped.shp property: K_ppm_imp # group_targets: # groups_eps: 0.09 @@ -39,22 +39,39 @@ learning: random_state: 1 max_depth: 20 optimisation: - searchcv_params: - n_iter: 6 + hyperopt_params: + max_evals: 5 + step: 2 cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - params_space: - 'max_depth': Integer(1, 15) - 'n_estimators': Integer(10, 100) - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') + verbose: true + random_state: 3 + scoring: r2 # r2, neg_mean_absolute_error, etc..see note above + algo: bayes # bayes, or anneal + hp_params_space: + max_depth: randint('max_depth', 1, 15) + n_estimators: randint('n_estimators', 5, 25) + max_features: choice('max_features', ['auto', 'sqrt', 'log2']) + min_samples_split: randint('min_samples_split', 2, 50) + min_samples_leaf: randint('min_samples_leaf', 1, 50) + min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) + max_leaf_nodes: randint('max_leaf_nodes', 10, 50) +# searchcv_params: +# n_iter: 6 +# cv: 2 +# verbose: 1000 +# n_points: 3 +# n_jobs: 6 +# params_space: +# 'max_depth': Integer(1, 15) +# 'n_estimators': Integer(10, 100) +# 'max_features': Categorical(['auto', 'sqrt', 'log2']) +# 'min_samples_split': Integer(2, 50) +# 'min_samples_leaf': Integer(1, 50) +# 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') prediction: + prediction_template: configs/data/sirsam/dem_foc2.tif quantiles: 0.95 outbands: 4 diff --git a/configs/reference_gb.yaml b/configs/reference_gb.yaml index 2c1717a8..c5f7f787 100644 --- a/configs/reference_gb.yaml +++ b/configs/reference_gb.yaml @@ -58,17 +58,17 @@ learning: algorithm: gradientboost arguments: target_transform: identity - loss: 'ls' - max_depth: 20 - learning_rate: 0.1 - n_estimators: 100 - subsample: 0.9 - min_samples_split: 2 - min_samples_leaf: 2 - min_weight_fraction_leaf: 0.0 - max_features: "auto" - alpha: 0.95 - random_state: 3 +# loss: 'ls' +# max_depth: 20 +# learning_rate: 0.1 + n_estimators: 200 +# subsample: 0.9 +# min_samples_split: 2 +# min_samples_leaf: 2 +# min_weight_fraction_leaf: 0.0 +# max_features: "auto" +# alpha: 0.95 +# random_state: 3 optimisation: hyperopt_params: max_evals: 5 @@ -89,22 +89,6 @@ learning: min_weight_fraction_leaf: uniform('min_weight_fraction_leaf', 0.0, 0.5) max_leaf_nodes: randint('max_leaf_nodes', 10, 50) - searchcv_params: - n_iter: 6 - cv: 2 - verbose: 1000 - n_points: 3 - n_jobs: 6 - 
params_space: - 'max_depth': Integer(1, 15) - 'learning_rate': Real(10 ** -5, 10 ** 0, prior="log-uniform") - 'n_estimators': Integer(10, 100) - 'subsample': Real(0.01, 1.0, prior='uniform') - 'max_features': Categorical(['auto', 'sqrt', 'log2']) - 'min_samples_split': Integer(2, 50) - 'min_samples_leaf': Integer(1, 50) - 'min_weight_fraction_leaf': Real(0.0, 0.5, prior='uniform') - 'max_leaf_nodes': Integer(10, 50) prediction: # corner_coordinates: @@ -114,10 +98,10 @@ prediction: outbands: 1 validation: - #- feature_rank + - feature_rank - parallel - k-fold: - folds: 3 + folds: 5 random_seed: 1 output: diff --git a/configs/reference_pca.yaml b/configs/reference_pca.yaml index aea2c3c9..46b76757 100644 --- a/configs/reference_pca.yaml +++ b/configs/reference_pca.yaml @@ -4,7 +4,7 @@ memory_fraction: 0.5 features: - name: my continuous features - type: continuous + type: ordinal files: - path: configs/data/sirsam/er_depg.tif - path: configs/data/sirsam/sagawet_b_sir.tif @@ -14,11 +14,19 @@ features: - path: configs/data/sirsam/relief_apsect.tif - path: configs/data/sirsam/LATITUDE_GRID1.tif - path: configs/data/sirsam/LONGITUDE_GRID1.tif + imputation: nn +# nodes: 5000 preprocessing: transforms: - whiten: - n_components: 4 +# n_components: 2 +# keep_fraction: 0.2 # one of keep fraction or n_components or variation_fraction allowed + variation_fraction: 0.98 + +mask: + file: /path/to/GA_data/GA-cover2/mask/old_mask_test.tif + retain: 1 pca: geotif: diff --git a/configs/reference_xgboost.yaml b/configs/reference_xgboost.yaml index 6c29d0e6..bf5fb8e0 100644 --- a/configs/reference_xgboost.yaml +++ b/configs/reference_xgboost.yaml @@ -1,26 +1,29 @@ - experiment: my_run patchsize: 0 memory_fraction: 0.5 features: - - name: my continuous features + - name: my continuous features 2 type: continuous files: - - path: configs/data/sirsam/er_depg.tif - - path: configs/data/sirsam/sagawet_b_sir.tif - path: configs/data/sirsam/dem_foc2.tif - path: configs/data/sirsam/outcrop_dis2.tif -# - path: configs/data/sirsam/k_15v5.tif - path: configs/data/sirsam/relief_apsect.tif -# - path: configs/data/sirsam/LATITUDE_GRID1.tif -# - path: configs/data/sirsam/LONGITUDE_GRID1.tif -# - directory: configs/data/weights/ transforms: - identity # - whiten: # keep_fraction: 0.98 - imputation: none + imputation: mean + - name: my continuous features 1 + type: continuous + files: + - path: configs/data/sirsam/er_depg.tif + - path: configs/data/sirsam/sagawet_b_sir.tif + transforms: + - identity + # - whiten: + # keep_fraction: 0.98 + imputation: gauss preprocessing: imputation: none @@ -125,7 +128,7 @@ prediction: outbands: 1 validation: - #- feature_rank +# - feature_rank - parallel - shapley # - permutation_importance @@ -135,9 +138,8 @@ validation: random_seed: 1 oos_validation: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/geochem_sites.shp + property: K_ppm_imp output: directory: ref_xgb/ - diff --git a/configs/resampling.yaml b/configs/resampling.yaml old mode 100755 new mode 100644 index 791831b0..c84cfc73 --- a/configs/resampling.yaml +++ b/configs/resampling.yaml @@ -2,8 +2,8 @@ # when undersample == true, we choose points with or without bootstrap targets: - file: configs/data/weights/Gamma_K_50.shp - property: K2O + file: configs/data/sirsam/out_resampled/geochem_sites.shp + property: K_ppm_imp resample: spatial: undersample: false diff --git a/scripts/check_covariates.py b/scripts/check_covariates.py new file mode 100644 index 00000000..117f556c --- /dev/null +++ 
b/scripts/check_covariates.py @@ -0,0 +1,61 @@ +import subprocess +from joblib import delayed, Parallel +from pathlib import Path +import rasterio as rio +from rasterio.io import DatasetReader +import numpy as np +import pandas as pd + +dir = "configs/data/sirsam" + +mask = Path(dir).joinpath("dem_foc2.tif") + +with rio.open(mask) as geotif: + mask_raster = geotif.read(1, masked=True) + + + +def _parallel_read(r: Path): + try: + with rio.open(r) as geotif: + raster: DatasetReader = geotif.read(1, masked=True) + m = geotif.meta + m['crs'] = m['crs'].to_string() + t = m.pop('transform') + m['pixsize_x'] = t[0] + m['top_left_x'] = t[2] + m['pixsize_y'] = -t[4] + m['top_left_y'] = t[5] + raster.mask = raster.mask | mask_raster.mask # we are not interested in masked areas + # print(raster) + m['all_finite'] = np.all(np.isfinite(raster)) + m['any_nan'] = np.any(np.isnan(raster)) + m['any_large'] = np.any(np.abs(raster) > 1e10) + m['min'] = np.ma.min(raster) + m['mean'] = np.ma.mean(raster) + m['median'] = np.ma.median(raster) + m['max'] = np.ma.max(raster) + m['std'] = np.ma.std(raster) + m['skew'] = 3 * (m['mean'] - m['median']) / m['std'] + # subprocess.run(f"gdalinfo {r.as_posix()} -stats", shell=True, capture_output=True) + # raster_attrs[r.stem] = m + return m + except Exception as e: + print(r) + print(e) + return [None] * 14 + + +rets = Parallel( + n_jobs=1, + verbose=100, +)(delayed(_parallel_read)(r) for r in Path(dir).glob("**/*.tif")) + +import pickle +with open("rets.pk", 'wb') as f: + pickle.dump(rets, f) + +raster_attrs = {r.stem: v for r, v in zip(Path(dir).glob("**/*.tif",), rets)} + +df = pd.DataFrame.from_dict(raster_attrs) +df.to_csv("quality.csv") diff --git a/scripts/dedupe_shape.py b/scripts/dedupe_shape.py new file mode 100644 index 00000000..42c2b440 --- /dev/null +++ b/scripts/dedupe_shape.py @@ -0,0 +1,70 @@ +from pathlib import Path +from joblib import Parallel, delayed +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd + + +def dedupe_raster(shp: Path, tif: Path, deduped_shp: Path): + """ + :param shp: input shapefile with dense points + :param tif: sample tif to read resolution details + :param deduped_shp: output shapefile with one point per down-sampled raster resolution + :return: + """ + print("====================================\n", f"deduping {shp.as_posix()}") + geom_cols = ['POINT_X', 'POINT_Y'] + pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g, axis=1) + coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) + + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) + pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, + geometry=gpd.points_from_xy(pts_deduped['POINT_X'], pts_deduped['POINT_Y']), + 
crs="EPSG:3577" # Australian Albers + ) + pts_deduped.to_file(deduped_shp.as_posix()) + return pts_deduped + + +if __name__ == '__main__': + shapefiles = Path("configs/data/") + downscale_factor = 6 # keep 1 point in a 6x6 cell + + dem = Path('/home/my_dem.tif') + output_dir = Path('1in6') + output_dir.mkdir(exist_ok=True, parents=True) + + # for s in shapefiles.glob("*.shp"): + # deduped_shp = output_dir.joinpath(s.name) + # dedupe_raster(shp=s, tif=dem, deduped_shp=deduped_shp) + + Parallel( + n_jobs=-1, + verbose=100, + )(delayed(dedupe_raster)(s, dem, output_dir.joinpath(s.name)) for s in shapefiles.glob("geochem_sites.shp")) diff --git a/scripts/dedupe_with_median_scratch.py b/scripts/dedupe_with_median_scratch.py new file mode 100644 index 00000000..e0bbc3fb --- /dev/null +++ b/scripts/dedupe_with_median_scratch.py @@ -0,0 +1,293 @@ +from typing import Dict +import csv +from collections import defaultdict +from os import path +from pathlib import Path +import numpy as np +import pandas as pd +import rasterio +import geopandas as gpd +from joblib import Parallel, delayed + + +def read_list_file(list_path: str): + files = [] + csvfile = path.abspath(list_path) + with open(csvfile, 'r') as f: + reader = csv.reader(f) + tifs = list(reader) + tifs = [f[0].strip() for f in tifs + if (len(f) > 0 and f[0].strip() and + f[0].strip()[0] != '#')] + for f in tifs: + files.append(path.abspath(f)) + return files + + +def generate_key_val(gl, shorts): + eight_char_name = Path(gl).name[:8] + if eight_char_name not in shorts: + shorts[eight_char_name] += 1 + else: + shorts[eight_char_name] += 1 + return gl, eight_char_name + str(shorts[eight_char_name]) + + +# import IPython; IPython.embed(); import sys; sys.exit() + + +# data_location = \ +# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") +# Read points from shapefile + +# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") + +# geotifs = { +# "relief_radius4.tif": "relief4", +# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", +# "MvrtpLL_smooth.tif": "mrvtpll_s", +# "MvrtpLL_fin.tif": "mvrtpll_f", +# "mrvbf_9.tif": "mrvbf_9", +# "Rad2016K_Th.tif": "rad2016kth", +# "Thorium_2016.tif": "th_2016", +# "Mesozoic_older_raster_MEAN.tif": "meso_mean", +# "LOC_distance_to_coast.tif": "loc_dis", +# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", +# "water-85m_1.tif": "water_85m", +# "clim_RSM_albers.tif": "clim_rsm", +# "tpi_300.tif": "tpi_300", +# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", +# "si_geol1.tif": "si_geol1", +# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", +# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", +# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", +# "Gravity_land.tif": "gravity", +# "dem_fill.tif": "dem", +# "Clim_Prescott_LindaGregory.tif": "clim_linda", +# "slope_fill2.tif": "slopefill2", +# "clim_PTA_albers.tif": "clim_alber", +# "SagaWET9cell_M.tif": "sagawet", +# "ceno_euc_aust1.tif": "ceno_euc", +# "s2-dpca-85m_band1.tif": "s2_band1", +# "s2-dpca-85m_band2.tif": "s2_band2", +# "s2-dpca-85m_band3.tif": "s2_band3", +# "3dem_mag0_finn.tif": "3dem_mag0", +# "3dem_mag1_fin.tif": "3dem_mag1", +# "3dem_mag2.tif": "3dem_mag2", +# } + +# geotifs = { +# "relief_apsect.tif": "relief4", +# "LATITUDE_GRID1.tif": "latitude", +# "LONGITUDE_GRID1.tif": "longitude", +# "er_depg.tif": "er_depg", +# "sagawet_b_sir.tif": "sagawet", +# "dem_foc2.tif": "dem_foc2", +# "outcrop_dis2.tif": "outcrop", +# "k_15v5.tif": "k_15v5", +# } + 
+majors = [ + "SIO2", + "TIO2", + "AL2O3", + "FE2O3TOT", + "FE2O3", + "FEO", + "MNO", + "MGO", + "CAO", + "NA2O", + "K2O", + "P2O5", +] + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], + dedupe: bool = False, + intersect_parallel: bool = False, + pixel_count: bool = False, + dedupe_with_median: bool = True + ): + print("====================================\n", f"intersecting {shp.as_posix()}") +pts = gpd.read_file(shp) +coords = np.array([(p.x, p.y) for p in pts.geometry]) +geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) +pts = pts.merge(geom, left_index=True, right_index=True) + if dedupe: + tif_name = list(geotifs.keys())[0] + tif = Path(tif_name) + # orig_cols = pts.columns + with rasterio.open(tif) as src: + # resample data to target shape + data = src.read( + out_shape=( + src.count, + int(src.height / downscale_factor), + int(src.width / downscale_factor) + ), + resampling=rasterio.enums.Resampling.bilinear + ) + # scale image transform + transform = src.transform * src.transform.scale( + (src.width / data.shape[-1]), + (src.height / data.shape[-2]) + ) + pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + # keep one only + # pts_deduped = pts.sort_values( + # by=geom_cols, ascending=[True, True] + # ).groupby(by=rows_and_cols, as_index=False).first()[orig_cols] + + # keep mean of repeated observations in a pixel + # + pts_count = pts.groupby(by=rows_and_cols, as_index=False).agg(px_count=('rows', 'count')) + # if dedupe_with_median: + # import IPython; IPython.embed(); import sys; sys.exit() + +rows_and_cols = ['rows', 'cols'] + +pts_majors = pts[majors + rows_and_cols + geom_cols + ['geometry']] +for m in majors: + pts[m] = pts[m].apply(lambda x: np.nan if x == 'NA' else float(x)) +pts_median = pts_majors[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).median() +pts_deduped = pts_median.merge(pts_count, how='inner', on=rows_and_cols) +rename_dict = {k: k + '_m' for k in majors} +pts_deduped = pts_deduped.rename(mapper=rename_dict, axis=1) +all_pts_with_count_and_median = pts.merge(pts_deduped, how='left', on=rows_and_cols) +for m in majors: + print(m) + all_pts_with_count_and_median[m + '_distance_from_median'] = \ + abs(all_pts_with_count_and_median[m]- all_pts_with_count_and_median[m + '_m'])/ \ + abs(all_pts_with_count_and_median[m + '_m']) + +def sum_distance_from_median(x): + s = np.nansum([v for k, v in x.items() if k.endswith('_distance_from_median')]) + return s + +all_pts_with_count_and_median['median_sum'] = all_pts_with_count_and_median.apply(lambda x: sum_distance_from_median( + x), axis=1) + +all_pts_with_count_and_median_min_median_sum = \ + all_pts_with_count_and_median.groupby(by=rows_and_cols, as_index=False).agg( + median_sum_min=('median_sum', 'min'), + median_sum_std=('median_sum', 'std') + ) + +all_pts = all_pts_with_count_and_median.merge(all_pts_with_count_and_median_min_median_sum, how='left', on=['rows', + 'cols']) + + +# all_pts['median_sum_to_median_min_diff'] = all_pts['median_sum'] - all_pts['median_sum_min'] + +# min_pts = all_pts.groupby(by=rows_and_cols, as_index=False).median_sum_to_median_min_diff.agg( +# ['min', 'std'] +# ) + +final_pts = all_pts[all_pts.median_sum==all_pts.median_sum_min] +final_pts_sorted = final_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) +all_pts_sorted = all_pts.sort_values(['px_count', 'rows', 'cols'], ascending=[False, True, True]) + +# all_pts.merge(min_pts, how="inner", on=['rows', 'cols', 
"median_sum_to_median_min_diff"]) + +# all_pts_majors_median_filtered = pts_count.merge(all_pts[all_pts.median_sum == all_pts.median_sum_min], +# how='left',on=rows_and_cols) + + else: # dedupe with mean + pts_mean = pts[majors + rows_and_cols].groupby(by=rows_and_cols, as_index=False).mean() + pts_deduped = pts_mean.merge(pts_count, how='inner', on=rows_and_cols) + # import IPython; IPython.embed(); import sys; sys.exit() + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) + # pts_deduped = pts.drop_duplicates(subset=rows_and_cols)[orig_cols] + coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) + else: + pts_deduped = pts + coords_deduped = coords + + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(coords_deduped, k) + + # pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster, coords_deduped): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) + output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') + out_shp = output_dir.joinpath(shp.name) + df3 = df2[list(geotifs.values())] + df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] + df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and there for _cleaned.shp file is not createed'") + print(f"No points dropped and there for _cleaned_dropped.shp file is not created") + except: + print(f"Check this shapefile {shp}") + + +if __name__ == '__main__': + # local + output_dir = Path('out_resampled') + shapefile_location = Path("configs/data") + downscale_factor = 2 # keep 1 point in a 2x2 cell + + geom_cols = ['POINT_X', 'POINT_Y'] + + covariastes_list = "configs/data/sirsam/covariates_list.txt" + geotifs_list = read_list_file(list_path=covariastes_list) + shorts = defaultdict(int) + + geotifs = {} + + for gl in geotifs_list: + k, v = generate_key_val(gl, shorts) + geotifs[k] = v + + # check all required files are available on disc + for k, v in geotifs.items(): + print(f"checking if {k} exists") + assert Path(k).exists() + + print("Add under 
the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("*.shp"): + intersect_sample_and_clean(s, True, True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) diff --git a/scripts/interpolate_shapes.py b/scripts/interpolate_shapes.py new file mode 100644 index 00000000..5140cb59 --- /dev/null +++ b/scripts/interpolate_shapes.py @@ -0,0 +1,68 @@ +from functools import partial +from pathlib import Path +from scipy import interpolate +import numpy as np +import pandas as pd +import geopandas as gpd +from joblib import Parallel, delayed + + +def interpolate_1d(row, thicknesses): + conductivities = row[conductivity_cols].values + f = interpolate.interp1d(thicknesses, conductivities, bounds_error=False, + fill_value=(conductivities[0], conductivities[-1]), assume_sorted=True) + new_conductivities = f(new_thicknesses) + return new_conductivities + + +def interpolate_shape(shp): + print(f"interpolating {shp}") + df = gpd.read_file(shp) + thicknesses = df.loc[0, thickness_cols].values.cumsum() + interpolate_1d_local = partial(interpolate_1d, thicknesses=thicknesses) + interpolated_conductivities = df.apply( + func=interpolate_1d_local, + axis=1, + raw=False, + ) + df.loc[:, conductivity_cols] = np.vstack(interpolated_conductivities) + df.loc[:, thickness_cols] = np.tile(new_thicknesses, (df.shape[0], 1)) + target_shp = output_dir.joinpath(Path(shp).name) + df.to_file(target_shp) + print(f"wrote interpolated shapefile {target_shp}") + + +if __name__ == '__main__': + + ref_df = gpd.read_file("/g/data/ge3/aem_sections/AEM_covariates/NT_1_albers.shp", rows=1) + + shapes_to_convert = [ + # "/g/data/ge3/data/AEM_shapes/AEM_ERC_ALL_interp_data_Albers_Ceno.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_1_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers_mod.shp", + "/g/data/ge3/data/AEM_shapes/East_C_2_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/East_C_3_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_1_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_2_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_3_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_4_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_5_albers.shp", + "/g/data/ge3/data/AEM_shapes/frome_6_albers.shp", + "/g/data/ge3/data/AEM_shapes/Mundi_Mundi_albers.shp", + ] + output_dir = Path('interpolated') + output_dir.mkdir(exist_ok=True, parents=True) + + conductivity_cols = [c for c in ref_df.columns if c.startswith('cond_')] + thickness_cols = [t for t in ref_df.columns if t.startswith('thick_')] + new_thicknesses = np.array(ref_df.loc[0, thickness_cols].values.cumsum(), dtype=np.float32) + + rets = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(interpolate_shape)(s) for s in shapes_to_convert) \ No newline at end of file diff --git a/scripts/intersect_raster_with_shape.py b/scripts/intersect_raster_with_shape.py new file mode 100644 index 00000000..b25a21f9 --- /dev/null +++ b/scripts/intersect_raster_with_shape.py @@ -0,0 +1,62 @@ +# 
/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318_std.tif +# /g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif + + +import geopandas as gpd +import numpy as np +import pandas as pd +import rasterio as rio + + +# with rasterio.open(tif) as src: +# # resample data to target shape +# data = src.read( +# out_shape=( +# src.count, +# int(src.height / downscale_factor), +# int(src.width / downscale_factor) +# ), +# resampling=rasterio.enums.Resampling.bilinear +# ) +# # scale image transform +# transform = src.transform * src.transform.scale( +# (src.width / data.shape[-1]), +# (src.height / data.shape[-2]) +# ) +# pts["rows"], pts["cols"] = rasterio.transform.rowcol(transform, coords[:, 0], coords[:, 1]) + + +def raster_points() -> gpd.GeoDataFrame: + with rio.Env(): + with rio.open( + '/g/data/ge3/sudipta/jobs/landshark_models/lake_eyre_cropped/04_11_2022/hw1_mean_318_5folds_complicated2/mean_318.tif') as src: + crs = src.crs + # create 1D coordinate arrays (coordinates of the pixel center) + xmin, ymax = np.around(src.xy(0.00, 0.00), 9) # src.xy(0, 0) + xmax, ymin = np.around(src.xy(src.height - 1, src.width - 1), 9) # src.xy(src.width-1, src.height-1) + x = np.linspace(xmin, xmax, src.width) + y = np.linspace(ymax, ymin, src.height) # max -> min so coords are top -> bottom + + # create 2D arrays + xs, ys = np.meshgrid(x, y) + zs = src.read(1) + + # Apply NoData mask + mask = src.read_masks(1) > 0 + xs, ys, zs = xs[mask], ys[mask], zs[mask] + data = {"X": pd.Series(xs.ravel()), + "Y": pd.Series(ys.ravel()), + "Z": pd.Series(zs.ravel())} + df = pd.DataFrame(data=data) + geometry = gpd.points_from_xy(df.X, df.Y) + gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry) + return gdf + + +raster_gdf = raster_points() + +shape_file = "" +shp_gdf = gpd.read_file(shape_file) +gpd.overlay(raster_gdf, shp_gdf, how="intersection") +print(raster_gdf.head()) + diff --git a/scripts/intersect_rasters.py b/scripts/intersect_rasters.py index f1fa77f0..ea50d789 100644 --- a/scripts/intersect_rasters.py +++ b/scripts/intersect_rasters.py @@ -30,73 +30,22 @@ def generate_key_val(gl, shorts): shorts[eight_char_name] += 1 else: shorts[eight_char_name] += 1 - return Path(gl).name, eight_char_name + str(shorts[eight_char_name]) - - -# import IPython; IPython.embed(); import sys; sys.exit() - - -# data_location = \ -# Path("/g/data/ge3/covariates/national_albers_filled_new/albers_cropped/") -# Read points from shapefile - -# shapefile_location = Path("/g/data/ge3/aem_sections/AEM_covariates/") - -# geotifs = { -# "relief_radius4.tif": "relief4", -# "national_Wii_RF_multirandomforest_prediction.tif": "mrf_pred", -# "MvrtpLL_smooth.tif": "mrvtpll_s", -# "MvrtpLL_fin.tif": "mvrtpll_f", -# "mrvbf_9.tif": "mrvbf_9", -# "Rad2016K_Th.tif": "rad2016kth", -# "Thorium_2016.tif": "th_2016", -# "Mesozoic_older_raster_MEAN.tif": "meso_mean", -# "LOC_distance_to_coast.tif": "loc_dis", -# "be-30y-85m-avg-ND-RED-BLUE.filled.lzw.nodata.tif": "be_av_rb", -# "water-85m_1.tif": "water_85m", -# "clim_RSM_albers.tif": "clim_rsm", -# "tpi_300.tif": "tpi_300", -# "be-30y-85m-avg-ND-SWIR1-NIR.filled.lzw.nodata.tif": "be_av_swir", -# "si_geol1.tif": "si_geol1", -# "be-30y-85m-avg-CLAY-PC2.filled.lzw.nodata.tif": "be_av_clay", -# "be-30y-85m-avg-GREEN.filled.lzw.nodata.tif": "be_av_gr", -# "be-30y-85m-avg_BLUE+SWIR2.tif": "be_av_bl", -# "Gravity_land.tif": "gravity", -# "dem_fill.tif": "dem", -# 
"Clim_Prescott_LindaGregory.tif": "clim_linda", -# "slope_fill2.tif": "slopefill2", -# "clim_PTA_albers.tif": "clim_alber", -# "SagaWET9cell_M.tif": "sagawet", -# "ceno_euc_aust1.tif": "ceno_euc", -# "s2-dpca-85m_band1.tif": "s2_band1", -# "s2-dpca-85m_band2.tif": "s2_band2", -# "s2-dpca-85m_band3.tif": "s2_band3", -# "3dem_mag0_finn.tif": "3dem_mag0", -# "3dem_mag1_fin.tif": "3dem_mag1", -# "3dem_mag2.tif": "3dem_mag2", -# } - -# geotifs = { -# "relief_apsect.tif": "relief4", -# "LATITUDE_GRID1.tif": "latitude", -# "LONGITUDE_GRID1.tif": "longitude", -# "er_depg.tif": "er_depg", -# "sagawet_b_sir.tif": "sagawet", -# "dem_foc2.tif": "dem_foc2", -# "outcrop_dis2.tif": "outcrop", -# "k_15v5.tif": "k_15v5", -# } - - -def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False): + return gl, eight_char_name + str(shorts[eight_char_name]) + + +def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = False, + intersect_parallel: bool = False): print("====================================\n", f"intersecting {shp.as_posix()}") pts = gpd.read_file(shp) + for g in geom_cols: + if g in pts.columns: + pts = pts.drop(g) coords = np.array([(p.x, p.y) for p in pts.geometry]) + geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) + pts = pts.merge(geom, left_index=True, right_index=True) if dedupe: - geom = pd.DataFrame(coords, columns=geom_cols, index=pts.index) - pts = pts.merge(geom, left_index=True, right_index=True) tif_name = list(geotifs.keys())[0] - tif = data_location.joinpath(tif_name) + tif = Path(tif_name) orig_cols = pts.columns with rasterio.open(tif) as src: # resample data to target shape @@ -124,40 +73,76 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = # pts_count = pts.groupby(by=['rows', 'cols'], as_index=False).agg(pixel_count=('rows', 'count')) pts_mean = pts.groupby(by=['rows', 'cols'], as_index=False).mean() - # import IPython; IPython.embed(); import sys; sys.exit() - pts_deduped = pts_mean.merge( - pts_count, how='inner', on=['rows', 'cols']).merge( - pts[['rows', 'cols', 'geometry']], how='inner', on=['rows', 'cols'] - ) + pts_deduped = pts_mean.merge(pts_count, how='inner', on=['rows', 'cols']) + + pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=gpd.points_from_xy(pts_deduped['POINT_X'], + pts_deduped['POINT_Y'])) # pts_deduped = pts.drop_duplicates(subset=['rows', 'cols'])[orig_cols] coords_deduped = np.array([(p.x, p.y) for p in pts_deduped.geometry]) else: pts_deduped = pts coords_deduped = coords - for k, v in geotifs.items(): - print(f"adding {k} to output dataframe") - with rasterio.open(data_location.joinpath(k)) as src: - pts_deduped[v] = [x[0] for x in src.sample(coords_deduped)] + # for k, v in geotifs.items(): + # print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + # pts_deduped[v] = __extract_raster_points(coords_deduped, k) + if intersect_parallel: + print("=====================Intersecting parallelly =======================") + pts_deduped__ = Parallel( + n_jobs=-1, + verbose=100, + )(delayed(__extract_raster_points)(r, coords_deduped) for r in geotifs.keys()) + + for i, col_name in enumerate(geotifs.values()): + pts_deduped[col_name] = pts_deduped__[i] + else: + for i, (k, v) in enumerate(geotifs.items()): + print(f"adding {i}/{len(geotifs)}: {k} to output dataframe") + pts_deduped[v] = __extract_raster_points(k, coords_deduped) - pts_deduped = gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) - output_dir = Path('out_resampled') + # pts_deduped 
= gpd.GeoDataFrame(pts_deduped, geometry=pts_deduped.geometry) + # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) + return pts_deduped + + +def __extract_raster_points(raster: str, coords_deduped: np.ndarray): + with rasterio.open(Path(raster)) as src: + print(f"---- intersecting {raster}--------") + return [x[0] for x in src.sample(coords_deduped)] + + +def intersect_sample_and_clean(shp, dedupe: bool = False, write_dropped: bool = False, + intersect_parallel: bool = False): + df2 = intersect_and_sample_shp(shp, geotifs, dedupe=dedupe, intersect_parallel=intersect_parallel) output_dir.mkdir(exist_ok=True, parents=True) + output_dir.joinpath('clean') out_shp = output_dir.joinpath(shp.name) - pts_deduped.to_file(out_shp.as_posix()) - print(f"saved intersected shapefile at {out_shp.as_posix()}") - # pts.to_csv(Path("out").joinpath(shp.stem + ".csv"), index=False) - return out_shp + df3 = df2[list(geotifs.values())] + finite_indices = ((np.isfinite(df3)).sum(axis=1) == len(geotifs)) & \ + ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)) + df4 = df2.loc[finite_indices, :] + df5 = df2.loc[~finite_indices, :] + try: + if df5.shape[0]: + df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) + print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") + if write_dropped: + df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) + else: + df2.to_file(out_shp.as_posix()) + print(f"saved intersected shapefile at {out_shp.as_posix()}") + print(f"No points dropped and therefore the _cleaned.shp file is not created") + print(f"No points dropped and therefore the _cleaned_dropped.shp file is not created") + except Exception as e: + print(e) + print(f"Check this shapefile {shp}") if __name__ == '__main__': # local - data_location = Path("configs/data/sirsam") - # tif_local = data_location.joinpath('LATITUDE_GRID1.tif') + output_dir = Path('out_resampled') shapefile_location = Path("configs/data") - shp = shapefile_location.joinpath('geochem_sites.shp') - - downscale_factor = 2 # keep 1 point in a 2x2 cell + downscale_factor = 1 # no downsampling: keep one point per original pixel geom_cols = ['POINT_X', 'POINT_Y'] @@ -171,28 +156,20 @@ def intersect_and_sample_shp(shp: Path, geotifs: Dict[str, str], dedupe: bool = k, v = generate_key_val(gl, shorts) geotifs[k] = v - for k, v in geotifs.items(): - print(f"\"{k}\": \"{v}\"") - # check all required files are available on disc for k, v in geotifs.items(): print(f"checking if {k} exists") - assert data_location.joinpath(k).exists() + assert Path(k).exists() - out_shp = intersect_and_sample_shp(shp, geotifs, dedupe=True) - df2 = gpd.GeoDataFrame.from_file(out_shp.as_posix()) - df3 = df2[list(geotifs.values())] - df4 = df2.loc[(df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs)), :] - df5 = df2.loc[~((df3.isna().sum(axis=1) == 0) & ((np.abs(df3) < 1e10).sum(axis=1) == len(geotifs))), :] - - df4.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')) - print(f"Wrote clean shapefile {out_shp.parent.joinpath(out_shp.stem + '_cleaned.shp')}") - if df5.shape[0]: - df5.to_file(out_shp.parent.joinpath(out_shp.stem + '_cleaned_dropped.shp')) - else: - print(f"No points dropped and there for _cleaned_dropped.shp file is not created") - -# rets = Parallel( - # n_jobs=-1, - # verbose=100, - # )(delayed(intersect_and_sample_shp)(s) for s in shapefile_location.glob("*.shp")) + print("Add under the 'intersected_features:' section in yaml") + print('='*100) + for k, v in geotifs.items(): + 
print(f"\"{Path(k).name}\": \"{v}\"") + + print('='*100) + for s in shapefile_location.glob("geochem_sites.shp"): + intersect_sample_and_clean(s, dedupe=True, write_dropped=True, intersect_parallel=True) + # rets = Parallel( + # n_jobs=-1, + # verbose=100, + # )(delayed(intersect_sample_and_clean)(s, True) for s in shapefile_location.glob("geochem_sites.shp")) diff --git a/scripts/plot_true_vs_prediction.py b/scripts/plot_true_vs_prediction.py index 819ad888..f7f4902a 100644 --- a/scripts/plot_true_vs_prediction.py +++ b/scripts/plot_true_vs_prediction.py @@ -99,3 +99,25 @@ def plot_quantiles(line): plt.show() + + +xx = line.d +y_test = line.Prediction +y_lower = line['Lower_quan'] +y_upper = line['Upper_quan'] +plt.plot(xx, line.y_true, "b.", markersize=10, label="Observed log10(conductivity)") +plt.plot(xx, line.out, "g", markersize=10, linestyle='-', + label="Kriged log10(conductivity)") +plt.plot(xx, y_test, color='orange', linestyle='-', + # marker='o', + label="Predicted median") +plt.plot(xx, y_upper, "k-") +plt.plot(xx, y_lower, "k-") +plt.fill_between( + # (95362.213934-line.d).ravel() + (line.d).ravel(), + y_lower, y_upper, alpha=0.4, label='Predicted 90% interval' +) +plt.legend(loc='upper left') +plt.ylabel('log10(conductivity)') +plt.xlabel('Distance along flight line (meters)') diff --git a/scripts/smooth_elevation.py b/scripts/smooth_elevation.py new file mode 100644 index 00000000..32dccbda --- /dev/null +++ b/scripts/smooth_elevation.py @@ -0,0 +1,64 @@ +from pathlib import Path +import numpy as np +from scipy.interpolate import interp2d, RegularGridInterpolator +import rasterio as rio +from rasterio.io import DatasetReader +smooth_dem = Path("/home/sudipta/Documents/nci/smooth_dem") +elevation = smooth_dem.joinpath("elevation_small.tif") +ceno = smooth_dem.joinpath("ceno_small.tif") +relief = smooth_dem.joinpath("relief_small.tif") + +with rio.open(elevation) as geotif: + profile: rio.profiles.Profile = geotif.profile + ele: DatasetReader = geotif.read(1, masked=True) + +with rio.open(relief) as geotif: + rel: DatasetReader = geotif.read(1, masked=True) + +with rio.open(ceno) as geotif: + cen: DatasetReader = geotif.read(1, masked=True) + +ele_mod = np.ma.array(data=ele.data, mask=(ele.mask | ((cen.data > 0.01) & (rel.data > 2.3))), fill_value=np.nan) +ele_mod.data[ele_mod.mask] = np.nan +# ele_mod.mask = ele.mask | ((cen.data > 0.01) & (rel.data > 0.01)) + + +def points_removed_interp(): + z = ele_mod + x, y = np.mgrid[0:z.shape[0], 0:z.shape[1]] + x1 = x[~z.mask] + y1 = y[~z.mask] + z1 = z[~z.mask] + print(z1.shape) + return interp2d(x1, y1, z1, kind="linear")(np.arange(z.shape[0]), np.arange(z.shape[1])) + + +def recatangular_interp(): + z = ele_mod.filled(np.nan) + x = np.array(range(ele_mod.shape[0])) + y = np.array(range(ele_mod.shape[1])) + zinterp = RegularGridInterpolator((x, y), z, method="linear") + X2, Y2 = np.meshgrid(x, y) + newpoints = np.array((X2, Y2)).T + + # actual interpolation + z2 = zinterp(newpoints) + z2_masked = np.ma.array(z2, mask=np.isnan(z2)) + return z2_masked + + +profile.update(compress='lzw') + +with rio.open(smooth_dem.joinpath("smooth_elevation_small_masked_ref_2p3.tif"), mode="w", **profile,) as update_dataset: + # out = recatangular_interp() + # out = points_removed_interp() + print(profile) + update_dataset.write(ele_mod, 1) + + +# import IPython; IPython.embed(); import sys; sys.exit() +# flat_ele = interp2d(x1, y1, z1) +# (np.arange(ele_mod.shape[0]), np.arange(ele_mod.shape[1])) + + + diff --git a/setup.py b/setup.py index 
068891c6..b14d4aba 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def run(self): 'revrand >= 0.9.10', 'mpi4py==3.1.3', 'scipy==1.6.2', - 'scikit-learn~=0.22.2', + 'scikit-learn==0.22.2.post1', 'scikit-image==0.19.1', 'scikit-optimize == 0.8.1', 'wheel >= 0.29.0', diff --git a/uncoverml/config.py b/uncoverml/config.py index 9abee74d..f7a67ce4 100644 --- a/uncoverml/config.py +++ b/uncoverml/config.py @@ -164,15 +164,20 @@ def __init__(self, yaml_file): self.krige = self.algorithm == 'krige' if 'prediction' in s: - self.quantiles = s['prediction']['quantiles'] - self.geotif_options = s['prediction']['geotif'] if 'geotif' in \ - s['prediction'] else {} + prediction = s["prediction"] + self.quantiles = prediction['quantiles'] + self.geotif_options = prediction['geotif'] if 'geotif' in prediction else {} self.outbands = None - if 'outbands' in s['prediction']: - self.outbands = s['prediction']['outbands'] - self.thumbnails = s['prediction']['thumbnails'] \ + if 'outbands' in prediction: + self.outbands = prediction['outbands'] + self.thumbnails = prediction['thumbnails'] \ if 'thumbnails' in s['prediction'] else 10 + self.prediction_template = prediction["prediction_template"] \ + if "prediction_template" in prediction else None + + self.is_prediction = False + if 'features' in s: self.pickle = any(True for d in s['features'] if d['type'] == 'pickle') else: @@ -292,6 +297,7 @@ def __init__(self, yaml_file): self.split_oos_fraction = s['targets']['split']['oos_fraction'] self.train_shapefile = Path(self.output_dir).joinpath(s['output']['train_shapefile']) self.oos_shapefile = Path(self.output_dir).joinpath(s['output']['oos_shapefile']) + self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') self.mask = None if 'mask' in s: @@ -303,6 +309,10 @@ def __init__(self, yaml_file): preprocessing_transforms = s['preprocessing']['transforms'] if 'n_components' in preprocessing_transforms[0]['whiten']: self.n_components = preprocessing_transforms[0]['whiten']['n_components'] + elif 'keep_fraction' in preprocessing_transforms[0]['whiten']: + self.keep_fraction = preprocessing_transforms[0]['whiten']['keep_fraction'] + elif 'variation_fraction' in preprocessing_transforms[0]['whiten']: + self.variation_fraction = preprocessing_transforms[0]['whiten']['variation_fraction'] else: self.n_components = None if 'geotif' not in s['pca']: @@ -388,14 +398,12 @@ def __init__(self, yaml_file): else self.name + ('.cluster' if self.clustering else '.model') self.model_file = Path(self.output_dir).joinpath(output_model) - if self.optimised_model: - self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp') - self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') - self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') - self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") - self.optimised_model_file = Path(self.output_dir).joinpath(self.name + "_optimised.model") - self.outfile_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") - self.optimised_model_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") + self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv') + self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv') + 
self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json") + self.optimised_model_file = Path(self.output_dir).joinpath(self.name + "_optimised.model") + self.outfile_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") + self.optimised_model_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json") class ConfigException(Exception): diff --git a/uncoverml/features.py b/uncoverml/features.py index bcbe4880..161e3804 100644 --- a/uncoverml/features.py +++ b/uncoverml/features.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from collections import OrderedDict import numpy as np import pickle @@ -11,14 +12,16 @@ from uncoverml import patch from uncoverml import transforms from uncoverml.config import Config +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) -def extract_subchunks(image_source, subchunk_index, n_subchunks, patchsize): +def extract_subchunks(image_source: RasterioImageSource, subchunk_index, n_subchunks, patchsize, + template_source: Optional[RasterioImageSource] = None): equiv_chunks = n_subchunks * mpiops.chunks equiv_chunk_index = mpiops.chunks*subchunk_index + mpiops.chunk_index - image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize) + image = Image(image_source, equiv_chunk_index, equiv_chunks, patchsize, template_source) x = patch.all_patches(image, patchsize) return x @@ -87,7 +90,9 @@ def transform_features(feature_sets, transform_sets, final_transform, config): if mpiops.chunk_index == 0 and config.pickle: log.info('Saving featurevec for reuse') pickle.dump(feature_vec, open(config.featurevec, 'wb')) - + if mpiops.chunk_index == 0: + for i, f in enumerate(feature_sets[0].keys()): + log.debug(f"Using feature num {i} from: {f}") x = np.ma.concatenate(transformed_vectors, axis=1) if config.cubist or config.multicubist or config.krige: log.warning("{}: Ignoring preprocessing " @@ -170,9 +175,15 @@ def cull_all_null_rows(feature_sets): bool_transformed_vectors = np.concatenate([t.mask for t in transformed_vectors], axis=1) - covaraiates = bool_transformed_vectors.shape[1] - rows_to_keep = np.sum(bool_transformed_vectors, axis=1) != covaraiates - return rows_to_keep + data_transformed_vectors = np.concatenate([t.data for t in + transformed_vectors], axis=1) + num_covariates = bool_transformed_vectors.shape[1] + + rows_with_at_least_one_cov_unmasked = np.sum(bool_transformed_vectors, axis=1) != num_covariates + # good rows are any covariate unmasked and all finite covariates + rows_with_all_finite_covariates = np.isfinite(data_transformed_vectors).sum(axis=1) == num_covariates + good_rows = rows_with_all_finite_covariates & rows_with_at_least_one_cov_unmasked + return good_rows def gather_features(x, node=None): diff --git a/uncoverml/geoio.py b/uncoverml/geoio.py index 38a9918c..33ee3b4e 100644 --- a/uncoverml/geoio.py +++ b/uncoverml/geoio.py @@ -1,4 +1,5 @@ from __future__ import division +from typing import Optional import joblib import os.path from pathlib import Path @@ -82,7 +83,9 @@ def __init__(self, filename): self.filename = filename assert os.path.isfile(filename), '{} does not exist'.format(filename) + with rasterio.open(self.filename, 'r') as geotiff: + self._full_res = (geotiff.width, geotiff.height, geotiff.count) self._nodata_value = geotiff.meta['nodata'] # we don't support different channels with different dtypes @@ -284,14 +287,17 @@ def load_targets(shapefile, targetfield, conf: Config): return targets -def 
get_image_spec(model, config): +def get_image_spec(model, config: Config): # temp workaround, we should have an image spec to check against nchannels = len(model.get_predict_tags()) return get_image_spec_from_nchannels(nchannels, config) -def get_image_spec_from_nchannels(nchannels, config): - imagelike = config.feature_sets[0].files[0] +def get_image_spec_from_nchannels(nchannels, config: Config): + if config.prediction_template and config.is_prediction: + imagelike = Path(config.prediction_template).absolute() + else: + imagelike = config.feature_sets[0].files[0] template_image = image.Image(RasterioImageSource(imagelike)) eff_shape = template_image.patched_shape(config.patchsize) + (nchannels,) eff_bbox = template_image.patched_bbox(config.patchsize) @@ -425,7 +431,7 @@ def output_thumbnails(self, ratio=10): resample(f, output_tif=thumbnail, ratio=ratio) -def feature_names(config): +def feature_names(config: Config): results = [] for s in config.feature_sets: @@ -438,9 +444,12 @@ def feature_names(config): return results -def _iterate_sources(f, config): +def _iterate_sources(f, config: Config): results = [] + template_tif = config.prediction_template if config.is_prediction else None + if config.is_prediction: + log.info(f"Using prediction template {config.prediction_template}") for s in config.feature_sets: extracted_chunks = {} for tif in s.files: @@ -448,18 +457,7 @@ def _iterate_sources(f, config): name = os.path.abspath(tif) image_source = RasterioImageSource(tif) x = f(image_source) - # TODO this may hurt performance. Consider removal - if type(x) is np.ma.MaskedArray: - count = mpiops.count(x) - # if not np.all(count > 0): - # s = ("{} has no data in at least one band.".format(name) + - # " Valid_pixel_count: {}".format(count)) - # raise ValueError(s) - missing_percent = missing_percentage(x) - t_missing = mpiops.comm.allreduce( - missing_percent) / mpiops.chunks - log.info("{}: {}px {:2.2f}% missing".format( - name, count, t_missing)) + log_missing_percentage(name, x) extracted_chunks[name] = x extracted_chunks = OrderedDict(sorted( extracted_chunks.items(), key=lambda t: t[0])) @@ -468,6 +466,21 @@ def _iterate_sources(f, config): return results +def log_missing_percentage(name, x): + # TODO this may hurt performance. 
Consider removal + if type(x) is np.ma.MaskedArray: + count = mpiops.count(x) + # if not np.all(count > 0): + # s = ("{} has no data in at least one band.".format(name) + + # " Valid_pixel_count: {}".format(count)) + # raise ValueError(s) + missing_percent = missing_percentage(x) + t_missing = mpiops.comm.allreduce( + missing_percent) / mpiops.chunks + log.info("{}: {}px {:2.2f}% missing".format( + name, count, t_missing)) + + def image_resolutions(config): def f(image_source): r = image_source._full_res @@ -477,10 +490,16 @@ def f(image_source): return result -def image_subchunks(subchunk_index, config): +def image_subchunks(subchunk_index, config: Config): + """This is used in prediction only""" - def f(image_source): - r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize) + def f(image_source: RasterioImageSource): + if config.is_prediction and config.prediction_template is not None: + template_source = RasterioImageSource(config.prediction_template) + else: + template_source = None + r = features.extract_subchunks(image_source, subchunk_index, config.n_subchunks, config.patchsize, + template_source=template_source) return r result = _iterate_sources(f, config) return result @@ -538,8 +557,9 @@ def f(image_source): patchsize=config.patchsize) if frac < 1.0: np.random.seed(1) - r = r[np.random.rand(r.shape[0]) < frac] - return r + rr = r[np.random.rand(r.shape[0]) < frac] + log.info(f"sampled {rr.shape[0]} from max possible {r.shape[0]}") + return rr result = _iterate_sources(f, config) return result diff --git a/uncoverml/image.py b/uncoverml/image.py index 395c23c9..0f4494d4 100644 --- a/uncoverml/image.py +++ b/uncoverml/image.py @@ -1,7 +1,9 @@ +from typing import Optional import numpy as np import logging from affine import Affine +from uncoverml.geoio import RasterioImageSource log = logging.getLogger(__name__) @@ -27,7 +29,10 @@ def construct_splits(npixels, nchunks, overlap=0): class Image: - def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): + def __init__(self, source: RasterioImageSource, + chunk_idx=0, nchunks=1, overlap=0, + t_source: Optional[RasterioImageSource] = None, + ): assert chunk_idx >= 0 and chunk_idx < nchunks if nchunks == 1 and overlap != 0: @@ -48,6 +53,10 @@ def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): self.pixsize_x = source.pixsize_x self.pixsize_y = source.pixsize_y self.crs = source.crs + log.debug(f"Image full resolution : {source.full_resolution}") + log.debug(f"Image origin longitude : {source.origin_longitude}") + log.debug(f"Image origin latitude : {source.origin_latitude}") + assert self.pixsize_x > 0 assert self.pixsize_y > 0 @@ -64,21 +73,29 @@ def __init__(self, source, chunk_idx=0, nchunks=1, overlap=0): self._pix_y_to_coords = dict(zip(pix_y, coords_y)) # exclusive y range of this chunk in full image - ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] - self._offset = np.array([0, ymin], dtype=int) - # exclusive x range of this chunk (same for all chunks) - xmin, xmax = 0, self._full_res[0] + if t_source: + self._t_full_res = t_source.full_resolution + self._t_start_lon = t_source.origin_longitude + self._t_start_lat = t_source.origin_latitude + self._t_pixsize_x = t_source.pixsize_x + self._t_pixsize_y = t_source.pixsize_y + tymin, tymax = construct_splits(self._t_full_res[1], nchunks, overlap)[chunk_idx] + t_x_offset = np.searchsorted(self._coords_x, self._t_start_lon) + t_y_offset = np.searchsorted(self._coords_y, self._t_start_lat) + xmin, 
xmax = t_x_offset, t_x_offset + self._t_full_res[0] + ymin, ymax = t_y_offset + tymin, t_y_offset + tymax + self.resolution = (xmax-xmin, ymax - ymin, self._t_full_res[2]) + else: + # exclusive x range of this chunk (same for all chunks) + xmin, xmax = 0, self._full_res[0] + ymin, ymax = construct_splits(self._full_res[1], nchunks, overlap)[chunk_idx] + self.resolution = (xmax - xmin, ymax - ymin, self._full_res[2]) + self._offset = np.array([xmin, ymin], dtype=int) assert(xmin < xmax) assert(ymin < ymax) - # get resolution of this chunk - xres = self._full_res[0] - yres = ymax - ymin - # Calculate the new values for resolution and bounding box - self.resolution = (xres, yres, self._full_res[2]) - start_bound_x, start_bound_y = self._global_pix2lonlat( np.array([[xmin, ymin]]))[0] # one past the last pixel diff --git a/uncoverml/mpiops.py b/uncoverml/mpiops.py index ad17814f..5c8e37a4 100644 --- a/uncoverml/mpiops.py +++ b/uncoverml/mpiops.py @@ -146,7 +146,11 @@ def outer(x): out = comm.allreduce(x_outer_local) still_masked = np.ma.count_masked(out) if still_masked != 0: - log.info('Reported out: ' + ', '.join([str(s) for s in out])) + log.info("=========still_masked =================") + if chunk_index == 0: + for s in out: + log.info(s) + # log.info('Reported out: ' + ', '.join([str(s) for s in out])) raise ValueError("Can't compute outer product:" " completely missing columns!") if hasattr(out, 'mask'): @@ -161,6 +165,7 @@ def covariance(x): def eigen_decomposition(x): + log.info(f"Eigenvalue matrix shape in process {chunk_index}: {x.shape}") eigvals, eigvecs = np.linalg.eigh(covariance(x)) return eigvals, eigvecs diff --git a/uncoverml/patch.py b/uncoverml/patch.py index 028cce5d..e7e7f4df 100644 --- a/uncoverml/patch.py +++ b/uncoverml/patch.py @@ -5,6 +5,7 @@ import numpy as np import skimage +from uncoverml.image import Image log = logging.getLogger(__name__) @@ -80,7 +81,7 @@ def point_patches(image, pwidth, points): return output -def _image_to_data(image): +def _image_to_data(image: Image): """ breaks up an image object into arrays suitable for sending to the patching functions @@ -92,7 +93,7 @@ def _image_to_data(image): return data, mask, data_dtype -def all_patches(image, patchsize): +def all_patches(image: Image, patchsize): data, mask, data_dtype = _image_to_data(image) patches = grid_patches(data, patchsize) patch_mask = grid_patches(mask, patchsize) diff --git a/uncoverml/predict.py b/uncoverml/predict.py index df06747a..1cc8d529 100644 --- a/uncoverml/predict.py +++ b/uncoverml/predict.py @@ -13,6 +13,7 @@ from uncoverml.krige import krig_dict from uncoverml import transforms from uncoverml.config import Config +from uncoverml.hyopt import NpEncoder log = logging.getLogger(__name__) float32finfo = np.finfo(dtype=np.float32) @@ -128,8 +129,7 @@ def _fix_for_corrupt_data(x, feature_names): x.data[min_mask] = float32finfo.min x.data[max_mask] = float32finfo.max x.mask += min_mask + max_mask - problem_feature_names = \ - list(compress(feature_names, ~isfinite.all(axis=0))) + problem_feature_names = list(compress(feature_names, ~isfinite.all(axis=0))) print("Warning: Float64 data was truncated to float32 max/min values." 
"Check your covariates for possible nodatavalue, datatype, " @@ -144,31 +144,22 @@ def _get_data(subchunk, config): # NOTE: This returns an *untransformed* x, # which is ok as we just need dummies here - if config.mask: - mask_x = _mask(subchunk, config) - all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) - if all_mask_x.shape[0] == np.sum(all_mask_x.mask): - x = np.ma.zeros((mask_x.shape[0], len(features_names)), - dtype=np.bool) - x.mask = True - log.info('Partition {} covariates are not loaded as ' - 'the partition is entirely masked.'.format(subchunk + 1)) - return x, features_names + # if config.mask: + # mask_x = _mask(subchunk, config) + # all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x)) + # if all_mask_x.shape[0] == np.sum(all_mask_x.mask): + # x = np.ma.zeros((mask_x.shape[0], len(features_names)), + # dtype=np.bool) + # x.mask = True + # log.info('Partition {} covariates are not loaded as ' + # 'the partition is entirely masked.'.format(subchunk + 1)) + # return x, features_names transform_sets = [k.transform_set for k in config.feature_sets] extracted_chunk_sets = geoio.image_subchunks(subchunk, config) log.info("Applying feature transforms") x = features.transform_features(extracted_chunk_sets, transform_sets, config.final_transform, config)[0] - # only check/correct float32 conversion for Ensemble models - if config.pca: - x = _fix_for_corrupt_data(x, features_names) - return _mask_rows(x, subchunk, config), features_names - - if not config.clustering: - if (isinstance(modelmaps[config.algorithm](), BaseEnsemble) or - config.multirandomforest): - x = _fix_for_corrupt_data(x, features_names) return _mask_rows(x, subchunk, config), features_names @@ -213,12 +204,12 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf log.info("Loaded {:2.4f}GB of image data".format(total_gb)) alg = config.algorithm log.info("Predicting targets for {}.".format(alg)) - if hasattr(config, 'quantiles'): - y_star = predict(x, model, interval=config.quantiles, - lon_lat=_get_lon_lat(subchunk, config)) - else: - y_star = predict(x, model, lon_lat=_get_lon_lat(subchunk, config)) - + try: + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) + except ValueError as v: + log.warning(v) + x = _fix_for_corrupt_data(x, feature_names) + y_star = predict(x, model, interval=config.quantiles, lon_lat=_get_lon_lat(subchunk, config)) if config.cluster and config.cluster_analysis: cluster_analysis(x, y_star, subchunk, config, feature_names) # cluster_analysis(x, y_star, subchunk, config, feature_names) @@ -227,17 +218,20 @@ def render_partition(model, subchunk, image_out: geoio.ImageWriter, config: Conf def export_pca(subchunk, image_out: geoio.ImageWriter, config: Config): x, _ = _get_data(subchunk, config) - mpiops.run_once(export_pca_fractions, config) total_gb = mpiops.comm.allreduce(x.nbytes / 1e9) log.info("Loaded {:2.4f}GB of image data".format(total_gb)) log.info("Extracting PCAs....") - image_out.write(np.flip(x, axis=1), subchunk) + # reverse x along columns which are eigen vectors so that the eigenvector corresponding to the largest eigenvalue + # is the first PC. 
+    x = apply_masked(lambda _x: _x, np.flip(x, axis=1))
+    image_out.write(x, subchunk)


-def export_pca_fractions(config):
+def export_pca_fractions(config: Config):
     whiten_transform = config.final_transform.global_transforms[0]
     with open(config.pca_json, 'w') as output_file:
-        json.dump(whiten_transform.explained_ratio, output_file, sort_keys=True, indent=4)
+        json.dump(whiten_transform.__dict__, output_file, sort_keys=True, indent=4, cls=NpEncoder)
+    log.info(f"Wrote PC contributions/whiten transform stats to {config.pca_json}")


 def cluster_analysis(x, y, partition_no, config, feature_names):
diff --git a/uncoverml/scripts/uncoverml.py b/uncoverml/scripts/uncoverml.py
index ebea5f57..598e7b43 100644
--- a/uncoverml/scripts/uncoverml.py
+++ b/uncoverml/scripts/uncoverml.py
@@ -14,6 +14,10 @@ from pathlib import Path

 import warnings

+warnings.filterwarnings(action='ignore', category=FutureWarning)
+warnings.filterwarnings(action='ignore', category=DeprecationWarning)
+warnings.filterwarnings(action='ignore', category=Warning)
+
 import click
 import numpy as np
 import matplotlib
@@ -39,9 +43,6 @@ log = logging.getLogger(__name__)

 # warnings.showwarning = warn_with_traceback

-warnings.filterwarnings(action='ignore', category=FutureWarning)
-warnings.filterwarnings(action='ignore', category=DeprecationWarning)
-warnings.filterwarnings(action='ignore', category=Warning)


 @click.group()
@@ -339,16 +340,20 @@ def validate(pipeline_file, model_or_cluster_file, partitions):
               help='mask file used to limit prediction area')
 @click.option('-r', '--retain', type=int, default=None,
               help='mask values where to predict')
-def predict(model_or_cluster_file, partitions, mask, retain):
+@click.option('-t', '--prediction_template', type=click.Path(exists=True), default=None,
+              help='optional geotiff to use as the prediction template')
+def predict(model_or_cluster_file, partitions, mask, retain, prediction_template):
     with open(model_or_cluster_file, 'rb') as f:
         state_dict = joblib.load(f)

     model = state_dict["model"]
     config = state_dict["config"]
+    config.is_prediction = True
     config.cluster = True if splitext(model_or_cluster_file)[1] == '.cluster' \
         else False
     config.mask = mask if mask else config.mask
+    config.prediction_template = prediction_template if prediction_template else config.prediction_template
     if config.mask:
         config.retain = retain if retain else config.retain
@@ -414,14 +419,20 @@ def predict(model_or_cluster_file, partitions, mask, retain):
 @click.argument('pipeline_file')
 @click.option('-p', '--partitions', type=int, default=1,
               help='divide each node\'s data into this many partitions')
+@click.option('-s', '--subsample_fraction', type=float, default=1.0,
+              help='only use this fraction of the data to compute the whiten (PCA) statistics')
 @click.option('-m', '--mask', type=str, default='',
               help='mask file used to limit prediction area')
 @click.option('-r', '--retain', type=int, default=None,
               help='mask values where to predict')
-def pca(pipeline_file, partitions, mask, retain):
+def pca(pipeline_file, partitions, subsample_fraction, mask, retain):
     config = ls.config.Config(pipeline_file)
+
+    __validate_pca_config(config)  # no other transforms other than whiten
     assert config.pca, "Not a pca analysis. Please include the pca block in your yaml!!"
     config.mask = mask if mask else config.mask
+
+    config.subsample_fraction = subsample_fraction
     if config.mask:
         config.retain = retain if retain else config.retain
@@ -432,24 +443,37 @@ def pca(pipeline_file, partitions, mask, retain):
     config.n_subchunks = partitions
     if config.n_subchunks > 1:
-        log.info("Memory contstraint forcing {} iterations "
-                 "through data".format(config.n_subchunks))
+        log.info("Memory constraint forcing {} iterations through data".format(config.n_subchunks))
     else:
         log.info("Using memory aggressively: dividing all data between nodes")

-    image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(config.n_components, config)
-    tr_whiten = __validate_pca_config(config)
+    # Get the image chunks and their associated transforms
+    image_chunk_sets = ls.geoio.unsupervised_feature_sets(config)
+    transform_sets = [k.transform_set for k in config.feature_sets]
+
+    features, _ = ls.features.transform_features(image_chunk_sets,
+                                                 transform_sets,
+                                                 config.final_transform,
+                                                 config)
+    features, _ = ls.features.remove_missing(features)
+    num_samples = np.sum(ls.mpiops.comm.gather(features.shape[0]))
+    log.info(f"Extracting the top {features.shape[1]} PCs from a random sampling of"
+             f" {num_samples} points from the rasters")
+    log.info(f"Done whiten transform with {subsample_fraction*100}% of all data")
+    ls.mpiops.run_once(ls.geoio.export_cluster_model, "dummy_model", config)
+    ls.mpiops.run_once(ls.predict.export_pca_fractions, config)
+
+    whiten_transform = config.final_transform.global_transforms[0]
+    image_shape, image_bbox, image_crs = ls.geoio.get_image_spec_from_nchannels(whiten_transform.keepdims, config)

     outfile_tif = config.name + "_pca"
     image_out = ls.geoio.ImageWriter(image_shape, image_bbox, image_crs,
                                      outfile_tif, config.n_subchunks,
                                      config.output_dir,
-                                     band_tags=[f'_pc_{n}' for n in range(1, config.n_components+1)],
+                                     band_tags=[f'pc_{n}' for n in range(1, whiten_transform.keepdims+1)],
                                      **config.geotif_options)
-
     for i in range(config.n_subchunks):
         log.info("starting to render partition {}".format(i+1))
-        # TODO: ideally want to take a random sample of each covariate and compute whiten stats
         ls.predict.export_pca(i, image_out, config)

     # explicitly close output rasters
@@ -459,6 +483,7 @@
 def __validate_pca_config(config):
+    # assert no other transforms other than whiten
     transform_sets = [k.transform_set for k in config.feature_sets]
     for t in transform_sets:
         assert len(t.image_transforms) == 0  # validation that there are no image or global transforms
diff --git a/uncoverml/transforms/linear.py b/uncoverml/transforms/linear.py
index 2fc0e7b8..0017da2f 100644
--- a/uncoverml/transforms/linear.py
+++ b/uncoverml/transforms/linear.py
@@ -1,7 +1,8 @@
-
+import logging
 import numpy as np

 from uncoverml import mpiops

+log = logging.getLogger(__name__)

 class CentreTransform:
@@ -79,36 +80,43 @@ def __call__(self, *args):


 class WhitenTransform:
-    def __init__(self, keep_fraction=None, n_components=None):
+    def __init__(self, keep_fraction=None, n_components=None, variation_fraction=None):
         self.mean = None
         self.eigvals = None
         self.eigvecs = None
         self.keep_fraction = keep_fraction
         self.n_components = n_components
-        if n_components is None:
-            assert keep_fraction is not None
-        if keep_fraction is None:
-            assert n_components is not None
-        assert (keep_fraction is None) or (n_components is None)
-        self.explained_ratio = None
+        self.variation_fraction = variation_fraction
+        null_check = [x is None for x in [keep_fraction, n_components, variation_fraction]]
+        assert np.sum(list(null_check)) == 2, "only one of keep_fraction, n_components, " \
+                                              "variation_fraction can be non null"
+        self.explained_ratio = {}

     def __call__(self, x):
         x = x.astype(float)
         if self.mean is None or self.eigvals is None or self.eigvecs is None:
             self.mean = mpiops.mean(x)
+            num_points = np.sum(mpiops.comm.gather(x.shape[0]))
+            num_covariates = x.shape[1]
+            log.info(f"Matrix for eigenvalue decomposition has shape {num_points, num_covariates}")
             self.eigvals, self.eigvecs = mpiops.eigen_decomposition(x)
             ndims = x.shape[1]
             # make sure 1 <= keepdims <= ndims
-            if self.n_components is None:
+            self.explained_ratio = {
+                ndims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-ndims:])/np.sum(np.abs(
+                    self.eigvals)))
+            }
+
+            if self.n_components is None and self.variation_fraction is None:
                 self.keepdims = min(max(1, int(ndims * self.keep_fraction)), ndims)
+            elif self.n_components is None and self.keep_fraction is None:
+                explained_ratio_cum_sum = np.cumsum(list(self.explained_ratio.values())[::-1])
+                self.keepdims = np.searchsorted(explained_ratio_cum_sum > self.variation_fraction*100, True) + 1
+                log.info(f"Variation percentage {self.variation_fraction*100} requires only {self.keepdims} PCs")
             else:
                 self.keepdims = self.n_components
                 assert self.n_components < ndims, "More components demanded than features. Not possible! \n " \
                                                   "Please reduce n_components or increase the number of features"
-            self.explained_ratio = {
-                self.keepdims - i: r * 100 for i, r in enumerate(np.abs(self.eigvals[-self.keepdims:])/np.sum(np.abs(
-                    self.eigvals)))
-            }
         mat = self.eigvecs[:, - self.keepdims:]
         vec = self.eigvals[np.newaxis, - self.keepdims:]
diff --git a/uncoverml/validate.py b/uncoverml/validate.py
index 54b82a86..0e42a946 100644
--- a/uncoverml/validate.py
+++ b/uncoverml/validate.py
@@ -374,14 +374,16 @@ def setup_validation_data(X, targets_all, cv_folds, random_state=None):
                                            random_state=random_state)

     rows_with_at_least_one_masked = ~ np.any(X.mask, axis=1)
-    X = X[rows_with_at_least_one_masked, :]
-    w = w[rows_with_at_least_one_masked]
-    y = y[rows_with_at_least_one_masked]
-    lon_lat = lon_lat[rows_with_at_least_one_masked, :]
-    groups = groups[rows_with_at_least_one_masked]
+    finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1]
+    valid_rows = rows_with_at_least_one_masked & finite_X
+    X = X[valid_rows, :]
+    w = w[valid_rows]
+    y = y[valid_rows]
+    lon_lat = lon_lat[valid_rows, :]
+    groups = groups[valid_rows]

     for (f, v), a in zip(targets_all.fields.items(), arrays):
-        targets_all.fields[f] = a[rows_with_at_least_one_masked]
+        targets_all.fields[f] = a[valid_rows]

     if len(np.unique(groups)) >= cv_folds:
         log.info(f'Using GroupKFold with {cv_folds} folds')
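Note on the export_pca_fractions change above: dumping whiten_transform.__dict__ requires a JSON encoder that understands numpy types, which is why cls=NpEncoder is passed. The real uncoverml.hyopt.NpEncoder is not shown in this patch; the sketch below is a typical encoder of that kind (NumpyJSONEncoder and the sample stats dict are hypothetical, for illustration only).

import json
import numpy as np

class NumpyJSONEncoder(json.JSONEncoder):
    """Serialise numpy scalars and arrays that json.dump cannot handle natively."""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# e.g. the kind of whiten-transform stats written to pca_json
stats = {"keepdims": np.int64(4), "eigvals": np.array([0.5, 0.3, 0.15, 0.04, 0.01])}
print(json.dumps(stats, sort_keys=True, indent=4, cls=NumpyJSONEncoder))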
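For reference, the variation_fraction branch added to WhitenTransform.__call__ amounts to a cumulative explained-variance cut-off on the eigenvalues. A minimal standalone sketch of that selection rule (not part of the patch; choose_keepdims and the sample eigenvalues are made up, and it works with fractions rather than the percentages stored in explained_ratio):

import numpy as np

def choose_keepdims(eigvals, variation_fraction):
    """Smallest number of PCs whose eigenvalues explain at least
    variation_fraction of the total variance.

    np.linalg.eigh returns eigenvalues in ascending order, so the
    leading PCs sit at the end of the array.
    """
    ratios = np.abs(eigvals) / np.sum(np.abs(eigvals))  # explained-variance ratios
    cumulative = np.cumsum(ratios[::-1])                # largest eigenvalue first
    # index of the first cumulative value exceeding the requested fraction
    keepdims = int(np.searchsorted(cumulative > variation_fraction, True)) + 1
    return min(keepdims, len(eigvals))

# eigenvalues as they might come out of mpiops.eigen_decomposition for 5 covariates
eigvals = np.array([0.01, 0.04, 0.15, 0.30, 0.50])
print(choose_keepdims(eigvals, variation_fraction=0.98))  # -> 4 PCs (cumulative 0.99)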
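Similarly, the setup_validation_data change keeps a row only when it has no masked covariate and every raw value is finite, and the same boolean index is then applied to every per-target array. A small self-contained illustration of that filter (not part of the patch; the toy arrays are invented):

import numpy as np

X = np.ma.masked_invalid(np.array([[1.0, 2.0],
                                   [np.nan, 3.0],
                                   [4.0, np.inf],
                                   [5.0, 6.0]]))
y = np.array([10.0, 20.0, 30.0, 40.0])

no_masked_cells = ~np.any(X.mask, axis=1)                  # rows with no masked covariate
finite_X = np.isfinite(X.data).sum(axis=1) == X.shape[1]   # rows with no nan/inf in the raw data
valid_rows = no_masked_cells & finite_X

X, y = X[valid_rows, :], y[valid_rows]
print(X.shape, y)  # (2, 2) [10. 40.]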