In [2]:
import os

# All paths in this notebook are built from the REPO_DIR environment variable.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise EnvironmentError(
        "The REPO_DIR environment variable is not set; "
        "point it at the root of the repository before running this notebook."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")
# Switch to the code directory (presumably so the project-local `analysis`
# package imported below can be resolved from the working directory).
os.chdir(code_dir)



import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import geopandas as gpd
import shapely
import fiona

import pandas as pd
import rasterio
import rasterio.mask
from rasterio import warp


from analysis.NL_feature_creation_and_other_NL_processing import nl_helpers

from analysis.prediction_utils import flatten_raster,upscale_grid_vector

## Make Non-linear NL features from 2019 VIIRS composite. We make features for ADM1, ADM0, ADM2, and DHS polygons

We use the annual 2019 VIIRS composite V2.1 average masked data product. This can be downloaded at the link below:

https://eogdata.mines.edu/nighttime_light/annual/v21/2019/

In [None]:
# Open the 2019 VIIRS annual composite (average, masked product) as a rasterio dataset.
src = rasterio.open(
    os.path.join(
        data_dir,
        'raw/VIIRS/VNL_v21_npp_2019_global_vcmslcfg_c202205302300.average_masked.dat.tif',
    )
)

In [None]:
# Load the first band of the VIIRS raster into memory.
data = src.read(indexes=1)

### bin options with percentiles

In [None]:
# `flatten` already returns a copy, so `data` itself is never modified and no
# additional `.copy()` (a second full-size allocation) is needed.
vals = data.flatten()
# Treat zero and negative values as missing so they do not enter the percentiles.
vals[vals<=0] = np.nan
# 21 evenly spaced percentiles (0, 5, ..., 100) of the strictly positive values.
percentiles = np.linspace(0,100,21)
thresholds = np.nanpercentile(vals, percentiles)
# Prepend the raster minimum so that the bin edges also span the non-positive values.
percentile_bins = np.hstack([data.min(), thresholds])

In [None]:
# Display the computed bin edges for inspection.
percentile_bins

In [4]:
# Persist the bin edges so other notebooks/scripts can reuse exactly the same bins,
# then read them back so the rest of this notebook uses the stored copy.
outpath = (data_dir+
           "int/VIIRS/VIIRS_feature_bin_widths.p")

# Context managers guarantee the file is flushed and closed before it is re-read.
with open(outpath, "wb") as bins_file:
    pickle.dump(percentile_bins, bins_file)

# NOTE: pickle.load executes arbitrary code; only load files produced by this pipeline.
with open(outpath, "rb") as bins_file:
    percentile_bins = pickle.load(bins_file)

In [None]:
## Note that when we exclude 0s and lower, we see a ~log normal distribution.
# `vals` is already one-dimensional, so it is passed directly (no extra flattened copy).
plt.hist(np.log(vals))

In [None]:
# Compare to the distribution of the approximately logged (arcsinh-transformed) VIIRS raster.
# `ravel` avoids an extra full-size copy; `np.arcsinh` allocates its own output array anyway.
plt.hist(np.arcsinh(data.ravel()))

### Now make a population raster that is identical in shape

In [None]:
def resize_raster_to_match_another_raster(src_to_resize,src_to_match, savepath = None,
                                          resampling_method=rasterio.enums.Resampling.bilinear,
                                         only_positive_values_in_output=True):
    """
    Resample a raster so that it has the same height, width and extent as another raster.

    Parameters
    ----------
    src_to_resize : open rasterio dataset
        Raster whose bands are read and resampled.
    src_to_match : open rasterio dataset
        Raster that defines the target bounds, height, width and (for the saved
        file) the CRS and transform.
    savepath : str, optional
        If provided, the resampled raster is written to this path as a GeoTIFF.
    resampling_method : rasterio.enums.Resampling
        Resampling algorithm passed to ``read`` (bilinear by default).
    only_positive_values_in_output : bool
        For the purpose of creating weights, we sometimes only want positive
        values in the output product. When True, zeros, negatives, and no-data
        values are replaced with the smallest positive value in the output.

    Returns
    -------
    np.ndarray
        Resampled data with shape ``(src_to_resize.count, src_to_match.height,
        src_to_match.width)``.

    Raises
    ------
    AssertionError
        If the two rasters do not share the same CRS.
    ValueError
        If ``only_positive_values_in_output`` is True but the resampled raster
        contains no positive value to use as the replacement.
    """

    assert src_to_resize.crs == src_to_match.crs, "not tested for use case where CRSs don't match"

    # Window of `src_to_resize` (in its own pixel coordinates) covering the bounds of `src_to_match`.
    match_window = rasterio.windows.from_bounds(*src_to_match.bounds,
                                                transform=src_to_resize.transform)

    # All bands of `src_to_resize` are read, so the band dimension of `out_shape`
    # must be its own band count (not that of `src_to_match`).
    data_matched = src_to_resize.read(
        out_shape=(src_to_resize.count, src_to_match.height, src_to_match.width),
        resampling=resampling_method,
        boundless=True,      # the window may extend past the edges of `src_to_resize`
        fill_value=np.nan,   # cells outside `src_to_resize` become NaN
        window=match_window,
    )

    ## Replace no data in src_to_resize with np.nan. More stable.
    nodata_value = src_to_resize.nodata
    if nodata_value is not None:
        data_matched[data_matched == nodata_value] = np.nan

    if only_positive_values_in_output:
        # NaN compares False, so `~is_positive` covers NaNs, zeros and negatives at once.
        is_positive = data_matched > 0
        if not is_positive.any():
            raise ValueError("resampled raster has no positive values to use as a replacement")
        smallest_positive = data_matched[is_positive].min()
        data_matched[~is_positive] = smallest_positive

    if savepath:
        print("saving file...")
        with rasterio.open(
            savepath,
            'w',
            driver='GTiff',
            height=src_to_match.height,
            width=src_to_match.width,
            count=src_to_resize.count,
            dtype=src_to_resize.meta["dtype"],
            nodata=np.nan,
            crs=src_to_match.crs,
            transform=src_to_match.transform,
        ) as dst:
            dst.write(data_matched)

    return data_matched
        

In [None]:
# Open the GHS population raster (2020 epoch, EPSG:4326, 30 arc-second grid per the file name).
# `data_dir` already ends with a separator, so the relative part must not start with "/".
src_pop = rasterio.open(os.path.join(data_dir, "raw/GHS_pop/GHS_POP_E2020_GLOBE_R2023A_4326_30ss_V1_0.tif"))

In [None]:
# Resample the population raster onto the VIIRS grid and write the result to disk.
data_pop_match = resize_raster_to_match_another_raster(
    src_pop,
    src,
    savepath=os.path.join(data_dir, "int/VIIRS/GHS_pop_density_match_viirs_rasterio_method.tif"),
    resampling_method=rasterio.enums.Resampling.bilinear,
    only_positive_values_in_output=True,
)

# The total population of the resampled raster no longer matches the source, and it is
# not expected to: the same "population weight" is assigned to many different VIIRS
# cells, so the weights need not sum to the global population.

In [None]:
# Re-open the resampled population raster from disk; it is used as the weight raster below.
src_pop_match = rasterio.open(
    os.path.join(data_dir, "int/VIIRS/GHS_pop_density_match_viirs_rasterio_method.tif")
)

## Now make GDL polygon features

In [None]:
# Load the cleaned ADM1 polygons.
gpdf = pd.read_pickle(os.path.join(data_dir, "int/GDL_HDI/HDI_ADM1_shapefile_clean.p"))

In [None]:
# Build the binned nightlight features for the ADM1 polygons, using the matched
# population raster as the weight raster.
percentile_feats = nl_helpers.create_nl_binned_dataframe(
    gpdf,
    src,
    bins=percentile_bins,
    weight_raster=src_pop_match,
)
# Plain string: the path has no placeholders, so no f-string prefix is needed.
percentile_feats.to_pickle(data_dir+"features/nl_features/GDL_HDI_polygons/viirs_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p")



In [None]:
## ADM0

In [None]:
# Load the dissolved ADM0 (country-level) polygons.
gpdf_adm0 = pd.read_pickle(os.path.join(data_dir, "int/GDL_HDI/HDI_ADM0_dissolved_shapefile.p"))

In [None]:
# Build the binned nightlight features for the ADM0 polygons, using the matched
# population raster as the weight raster.
percentile_feats_adm0 = nl_helpers.create_nl_binned_dataframe(
    gpdf_adm0,
    src,
    bins=percentile_bins,
    weight_raster=src_pop_match,
)
# Plain string: the path has no placeholders, so no f-string prefix is needed.
percentile_feats_adm0.to_pickle(data_dir+"features/nl_features/GDL_ADM0_polygons/viirs_adm0_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p")

## Now make DHS polygon features

In [None]:
# Load the clipped DHS cluster buffer polygons and index them by DHS cluster id.
dhs_shp = pd.read_pickle(os.path.join(data_dir, "int/GDL_IWI/DHS_iwi_cluster_buffers_clipped.p"))
dhs_shp = dhs_shp.set_index("DHSID")

In [None]:
# Binned nightlight features for the DHS cluster polygons, followed by the
# helper's correction pass over the same polygons and bins.
out = nl_helpers.create_nl_binned_dataframe(
    dhs_shp,
    bins=percentile_bins,
    raster_file=src,
    weight_raster=src_pop_match,
)
out = nl_helpers.correct_nl_df_creation(
    out,
    dhs_shp,
    bins=percentile_bins,
    raster_file=src,
)

# Every feature must be populated before the table is written out.
assert not out.isnull().values.any()

out.to_pickle(
    os.path.join(
        data_dir,
        "features/nl_features/DHS_polygons/viirs_dhs_cluster_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p",
    )
)


## Now make ADM2 polygon features

In [None]:
# Read the geoBoundaries ADM2 polygons and index them by their shape id.
file = os.path.join(data_dir, "raw/geoBoundaries/geoBoundariesCGAZ_ADM2.geojson")
adm2 = gpd.read_file(file)
adm2 = adm2.set_index("shapeID")

In [None]:
# Binned nightlight features for the ADM2 polygons, followed by the helper's
# correction pass over the same polygons and bins.
out = nl_helpers.create_nl_binned_dataframe(
    adm2,
    bins=percentile_bins,
    raster_file=src,
    weight_raster=src_pop_match,
)
out = nl_helpers.correct_nl_df_creation(
    out,
    adm2,
    bins=percentile_bins,
    raster_file=src,
)

# Every feature must be populated before the table is written out.
assert not out.isnull().values.any()

out.to_pickle(
    os.path.join(
        data_dir,
        "features/nl_features/geoBoundaries_ADM2/viirs_geoBoundaries_ADM2_percentile_binned_feats_GHS_pop_weighted_rasterio_method.p",
    )
)


# Now make the Y values

### ADM2

In [None]:
# Re-read the ADM2 polygons, this time renaming the ADM1 id column so it can be
# used as the dissolve key further down.
file = os.path.join(data_dir, "raw/geoBoundaries/geoBoundariesCGAZ_ADM2.geojson")
adm2 = gpd.read_file(file)
adm2 = adm2.set_index("shapeID")
adm2 = adm2.rename(columns={"ADM1_shapeID": "ADM1_shape"})


In [None]:
# Average and population-weighted average nightlights per ADM2 polygon,
# computed with the helper's `log1_raster` option enabled.
out_adm2 = nl_helpers.get_avg_nl_and_weighted_avg_nl(
    adm2,
    raster_file=src,
    weight_raster=src_pop_match,
    log1_raster=True,
)
# Correction pass; `off_raster_val=np.nan` is handed to the helper for
# geometries that are not on the raster.
out_adm2 = nl_helpers.correct_nl_Ys(
    out_adm2,
    adm2,
    raster_file=src,
    off_raster_val=np.nan,
    log1_raster=True,
)

# No null check here: NAs are allowed when geometries are not on the viirs raster.
#assert out_adm2.isnull().sum().sum() == 0

out_adm2.to_pickle(os.path.join(data_dir, "int/VIIRS/viirs_avg_nl_Ys_geoB_adm2_GHS_pop.p"))

### ADM1

In [None]:
# Keep only valid geometries (drops 3 observations); `adm2` is deliberately
# overwritten because the ADM0 cell below reuses the filtered frame.
adm2 = adm2.loc[adm2["geometry"].is_valid]
# Merge the ADM2 polygons into their parent ADM1 units.
adm1_geoboundaries = adm2.dissolve("ADM1_shape")

# Average and population-weighted average nightlights per ADM1 polygon.
out_adm1 = nl_helpers.get_avg_nl_and_weighted_avg_nl(
    adm1_geoboundaries,
    raster_file=src,
    weight_raster=src_pop_match,
    log1_raster=True,
)
out_adm1 = nl_helpers.correct_nl_Ys(
    out_adm1,
    adm1_geoboundaries,
    raster_file=src,
    off_raster_val=np.nan,
    log1_raster=True,
)

# No null check here: NAs are allowed when geometries are not on the viirs raster.
#assert out_adm1.isnull().sum().sum() == 0

out_adm1.to_pickle(os.path.join(data_dir, "int/VIIRS/viirs_avg_nl_Ys_geoB_adm1_GHS_pop.p"))

### ADM0

In [None]:
# Dissolve the ADM2 polygons into country-level (ADM0) polygons.
# The validity filter is applied here as well (it is idempotent) so this cell does
# not silently depend on `adm2` having been filtered by an earlier cell.
adm0_geoboundaries = adm2[adm2["geometry"].is_valid].dissolve("shapeGroup")

# Average and population-weighted average nightlights per ADM0 polygon.
out_adm0 = nl_helpers.get_avg_nl_and_weighted_avg_nl(adm0_geoboundaries, raster_file=src,
                                                     weight_raster = src_pop_match,
                                                    log1_raster=True)
out_adm0 = nl_helpers.correct_nl_Ys(out_adm0, adm0_geoboundaries, raster_file=src,off_raster_val=np.nan,
                                   log1_raster=True)

# Unlike the ADM2/ADM1 outputs, NAs are NOT tolerated at the country level:
# every ADM0 geometry is expected to overlap the viirs raster.
assert out_adm0.isnull().sum().sum() == 0

out_adm0.to_pickle(data_dir + "int/VIIRS/viirs_avg_nl_Ys_geoB_adm0_GHS_pop.p")