In [1]:
from dataset_builder import EEDatasetBuilder
import geemap
import ee
import os
import numpy as np
import rasterio
import dask.array as da

In [2]:
# Trigger the authentication flow.
ee.Authenticate()


Successfully saved authorization token.


In [3]:
# Initialize the library.
ee.Initialize()

In [4]:
Map = geemap.Map(center=[ 25.239, 82.396], zoom=9)
Map

Map(center=[25.239, 82.396], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(child…

# This first part is to create the image file needed to identify the Naalas, not clip the area for the ganga river!!

In [5]:
# Build ee dataset builder
ee_dataset_builder = EEDatasetBuilder()

In [6]:
# buffer zone for ganga river
ganga_buff = ee.FeatureCollection('projects/nikhilrajdeep/assets/Buffer_Ganga_river')

In [7]:
subsection_clip = ee.FeatureCollection("projects/ee-warnermichael09/assets/gridded_30m_subsection")

In [8]:
subset_naalas = ee.Image('projects/ee-warnermichael09/assets/naalas_masked_30m').clip(subsection_clip)

In [9]:
Map.addLayer(subset_naalas,{} , 'naalas masked')

In [10]:
# Need to upload the masked naala area
ee_dataset_builder.filtered_response_layer_from_raster(
    response_raster="custom", 
    ee_image=subset_naalas,
    custom_response_raster_name='response'
)

ee.Image({
  "functionInvocationValue": {
    "functionName": "Image.rename",
    "arguments": {
      "input": {
        "functionInvocationValue": {
          "functionName": "Image.clip",
          "arguments": {
            "geometry": {
              "functionInvocationValue": {
                "functionName": "Collection.loadTable",
                "arguments": {
                  "tableId": {
                    "constantValue": "projects/ee-warnermichael09/assets/gridded_30m_subsection"
                  }
                }
              }
            },
            "input": {
              "functionInvocationValue": {
                "functionName": "Image.load",
                "arguments": {
                  "id": {
                    "constantValue": "projects/ee-warnermichael09/assets/naalas_masked_30m"
                  }
                }
              }
            }
          }
        }
      },
      "names": {
        "constantValue": [
          "response"
      

PD
LULC
Prec
DD
Slope
LST #land surface temp

### Local way of setting up the geoTIFF files

In [11]:
#out_dir = os.path.expanduser(r'\Data\geotiffs')

#if not os.path.exists(out_dir):
#    os.makedirs(out_dir)

#PD = os.path.join(r"Data\geotiffs\pop_bucket.tif")
#LULC = os.path.join(r"C:\Users\warne\Desktop\Projects\naalas_2\Data\geotiffs\lulcCond.tif")
#Prec = os.path.join(r'C:\Users\warne\Desktop\Projects\naalas_2\Data\geotiffs\rainCond.tif')
#DD = os.path.join(r"Data\geotiffs\ddCond.tif")
#Slope = os.path.join(r'Data\geotiffs\slopeCond.tif')
#LST = os.path.join(r'Data\geotiffs\tempCond.tif')



In [12]:
#Population Density
#Map.add_raster(PD, layer_name = "Population Density")
#Map.add_raster(LULC, layer_name = "Land use, Land cover")
#Map.add_raster(Prec, layer_name = "Rainfall")
#Map.add_raster(DD, layer_name = "Drainage Density")
#Map.add_raster(Slope, layer_name = "Slope")
#Map.add_raster(LST, layer_name = "Land Surface Temp")

#### Online way of setting up datasets through GEE

##### Chirps is the rain fall dataset

In [13]:
# Loading of the data sets
merit_water = ee.Image('MERIT/Hydro/v1_0_1').rename("hydro");
ele = ee.Image("NASA/NASADEM_HGT/001").rename("ele");
twi = ee.Image("projects/nikhilrajdeep/assets/TWI_Ganga");
dd = ee.Image("projects/nikhilrajdeep/assets/drainage_density").rename("dd");
chirps = ee.ImageCollection("UCSB-CHG/CHIRPS/PENTAD");
l8 = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2");
lc = ee.ImageCollection("ESA/WorldCover/v200");
soil = ee.Image("OpenLandMap/SOL/SOL_TEXTURE-CLASS_USDA-TT_M/v02");
pop = ee.ImageCollection("CIESIN/GPWv411/GPW_UNWPP-Adjusted_Population_Density");

In [14]:
base = []

In [15]:
base = merit_water.rename("hydro").clip(ganga_buff)

In [16]:
base = base.addBands(dd).rename("dd").clip(ganga_buff)
base = base.addBands(ele).rename("ele").clip(ganga_buff)

In [17]:

base = base.addBands(twi).rename("twi").clip(ganga_buff)
base = base.addBands(chirps).rename("chirps").clip(ganga_buff)
base = base.addBands(l8).rename("l8").clip(ganga_buff)
base = base.addBands(lc).rename("1c").clip(ganga_buff)
base = base.addBands(soil).rename("soil").clip(ganga_buff)
base = base.addBands(pop).rename("pop").clip(ganga_buff)

In [18]:
base

In [19]:
merit_water = ee.Image('MERIT/Hydro/v1_0_1').clip(subsection_clip);
ele = ee.Image("NASA/NASADEM_HGT/001");
twi = ee.Image("projects/nikhilrajdeep/assets/TWI_Ganga");
dd = ee.Image("projects/nikhilrajdeep/assets/drainage_density");
chirps = ee.ImageCollection("UCSB-CHG/CHIRPS/PENTAD");
l8 = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2");
lc = ee.ImageCollection("ESA/WorldCover/v200");
soil = ee.Image("OpenLandMap/SOL/SOL_TEXTURE-CLASS_USDA-TT_M/v02");
pop = ee.ImageCollection("CIESIN/GPWv411/GPW_UNWPP-Adjusted_Population_Density");

# Population Density

In [20]:
gpop = pop.toBands().select('gpw_v4_population_density_adjusted_to_2015_unwpp_country_totals_rev11_2020_30_sec_unwpp-adjusted_population_density').clip(subsection_clip).rename('popDensity');

In [21]:
gpop_cond = gpop.expression(
    "(b('popDensity') > 0 && b('popDensity') < 865) ? 1" +
    ":(b('popDensity') >= 865 && b('popDensity') < 1600) ? 2" +
    ":(b('popDensity') >= 1600 && b('popDensity') < 2680) ? 3" +
    ":(b('popDensity') >= 2680 && b('popDensity') < 4036) ? 4" +
    ":(b('popDensity') >= 4036) ? 5" +
    ": 0"  # Default value in case none of the conditions above are met
).clip(subsection_clip)

In [22]:
conditionParams = {
    'min': 1,
    'max': 5,
    'palette': ['1a9641', 'a6d96a', 'ffffbf', 'fdae61', 'd7191c']
}

In [23]:
Map.addLayer(gpop_cond, conditionParams, 'Population density')

In [24]:
predictors = gpop_cond.rename('PD')

# Land Use Land Cover, LULC

In [25]:
lulc = lc.first().clip(subsection_clip).rename('LULC')

In [26]:
lulcCond = (lulc.eq(80).multiply(5)
            .where(lulc.eq(90), 1)
            .where(lulc.eq(10), 1)
            .where(lulc.eq(20), 1)
            .where(lulc.eq(30), 1)
            .where(lulc.eq(40), 4)
            .where(lulc.eq(60), 2)
            .where(lulc.eq(50), 3))

In [27]:
Map.addLayer(lulcCond, conditionParams, 'LULC');

In [28]:
predictors = predictors.addBands(lulcCond)

# Precipitation, chirps dataset




In [29]:
rain = chirps.filterDate('2022-01-01','2023-01-01').sum().clip(subsection_clip)

In [30]:
rainCond = (rain
  .where(rain.lt(868), 1)
  .where(rain.gte(868)and(rain.lt(1019)), 2)
  .where(rain.gte(1019)and(rain.lt(1189)), 3)
  .where(rain.gte(1189)and(rain.lt(1410)), 4)
  .where(rain.gte(1410), 5))


In [31]:
Map.addLayer(rainCond, conditionParams, 'Rainfall Categorized');

In [32]:
predictors = predictors.addBands(rainCond)

# Drainage Density DD

Drainage density

| DD         | Condition     | 

| <0.53      | Extreme low   |
 
 | 0.53-1.43  | low           |
 
 | 1.43-2.57  | Moderate      |
 
 | 2.57-4     | high          |
 
 | >4         | Extreme high  |

In [33]:
ddCond = (dd
  .where(dd.lt(0.6), 1)
  .where(dd.gte(0.6)and(dd.lt(1.5)), 2)
  .where(dd.gte(1.5)and(dd.lt(2.6)), 3)
  .where(dd.gte(2.6)and(dd.lt(4)), 4)
  .where(dd.gte(4), 5)).rename('drainage_density').clip(subsection_clip)

In [34]:
Map.addLayer(ddCond, conditionParams, 'drainage density');

In [35]:
predictors = predictors.addBands(ddCond)

# Slope

In [36]:
dem = ele.select('elevation')
slope = ee.Terrain.slope(dem).clip(subsection_clip)

In [37]:
slopeCond = (slope
  .where(slope.lt(4), 5)
  .where(slope.gte(4)and(slope.lt(12)), 4)
  .where(slope.gte(12)and(slope.lt(24)), 3)
  .where(slope.gte(24)and(slope.lt(35)), 2)
  .where(slope.gte(35), 1))

In [38]:
Map.addLayer(slopeCond, conditionParams, 'slope');

In [39]:
predictors = predictors.addBands(slopeCond)

In [40]:
predictors

# NDVI, needed for LST

In [41]:
def maskL8sr(image):
    qaMask = image.select('QA_PIXEL').bitwiseAnd(int('11111', 2)).eq(0)
    saturationMask = image.select('QA_RADSAT').eq(0)

    opticalBands = image.select('SR_B.').multiply(0.0000275).add(-0.2)
    thermalBands = image.select('ST_B.*').multiply(0.00341802).add(149.0)

    return image.addBands(opticalBands, None, True)\
                .addBands(thermalBands, None, True)\
                .updateMask(qaMask)\
                .updateMask(saturationMask)

In [42]:
def addindices(image):
    ndvi = image.normalizedDifference(['SR_B5', 'SR_B4']).rename('NDVI')
    return image.addBands(ndvi)


In [43]:
ndvi = (l8.filterDate('2022-01-01', '2023-01-01')
        .filterBounds(ganga_buff)
        .filterMetadata('CLOUD_COVER', 'less_than', 10)
        .map(maskL8sr)
        .map(addindices)
        .select('NDVI')
        .mean()
        .clip(ganga_buff))

ndviCond = (ndvi.where(ndvi.lt(0.14), 5)
           .where(ndvi.gte(0.14)and(ndvi.lt(0.33)), 4)
           .where(ndvi.gte(0.33)and(ndvi.lt(0.44)), 3)
           .where(ndvi.gte(0.44)and(ndvi.lt(0.58)), 2)
           .where(ndvi.gte(0.58), 1))


# Land Surface Temperature LST

In [44]:
temp = (l8.filterDate('2021-01-01', '2022-01-01')
        .filterBounds(subsection_clip)
        .filterMetadata('CLOUD_COVER', 'less_than', 10)
        .map(maskL8sr)
        .select('ST_B10')
        .map(lambda image: image.subtract(273.15))
        .mean()
        .clip(subsection_clip)).rename('LST')

In [45]:
tempCond = (temp
  .where(temp.lt(25), 1)
  .where(temp.gte(25)and(temp.lt(30)), 2)
  .where(temp.gte(30)and(temp.lt(34)), 3)
  .where(temp.gte(34)and(temp.lt(37)), 4)
  .where(temp.gte(37), 5))

In [46]:
Map.addLayer(tempCond, conditionParams, 'LST 2022');

In [47]:
predictors = predictors.addBands(tempCond)

# Merit - Hydro NOT IMPLEMENTED

In [48]:
predictors.bandNames().getInfo()

['PD', 'LULC', 'precipitation', 'drainage_density', 'slope', 'LST']

In [49]:
name_custom_ee_images_list = predictors.bandNames().getInfo()
ee_images_list = [predictors.select(band_name) for band_name in name_custom_ee_images_list]
predictors_list = ['custom_ee_image' for x in name_custom_ee_images_list]

In [50]:
ee_dataset_builder.spatial_covariates(covariates=predictors_list, 
                                      ee_image=ee_images_list,
                                      name_custom_ee_image=name_custom_ee_images_list ) 

In [51]:
ee_dataset_builder.image.bandNames().getInfo()

['response', 'PD', 'LULC', 'precipitation', 'drainage_density', 'slope', 'LST']

# Export samples CSV to GCP

In [52]:
start_index = 0
end_index = 10  # example range
nb_features, list_features_assets = ee_dataset_builder.load_ee_asset_shapefile('projects/ee-warnermichael09/assets/gridded_30m_subsection', start_index, end_index)

if nb_features is None:
    print("Failed to load the subset of the Earth Engine asset.")
else:
    print(f"Subset loaded {nb_features, list_features_assets} successfully.")

Subset loaded (6007, [{'type': 'Feature', 'geometry': {'type': 'LineString', 'coordinates': [[81.85803238699944, 25.601521988089], [81.86947019398383, 25.60145442179845]]}, 'id': '00000000000000000000', 'properties': {'bottom': 2831834.99417558, 'id': 43, 'left': 586159.4528034591, 'right': 702828.3092464125, 'top': 2831834.99417558}}, {'type': 'Feature', 'geometry': {'type': 'LineString', 'coordinates': [[81.8580304677284, 25.601253066464743], [81.87395635609175, 25.601158744333695]]}, 'id': '00000000000000000001', 'properties': {'bottom': 2831804.99417558, 'id': 44, 'left': 586159.4528034591, 'right': 702828.3092464125, 'top': 2831804.99417558}}, {'type': 'Feature', 'geometry': {'type': 'LineString', 'coordinates': [[81.85802851649758, 25.600979662803127], [81.87684592540604, 25.60086803049321]]}, 'id': '00000000000000000002', 'properties': {'bottom': 2831774.99417558, 'id': 45, 'left': 586159.4528034591, 'right': 702828.3092464125, 'top': 2831774.99417558}}, {'type': 'Feature', 'geo

In [53]:
# Gridded shapefile asset in GEE
# Ganga 150m^2 grid file
shp_asset_path = 'projects/ee-warnermichael09/assets/gridded_30m_subsection'

# Load the feature collection
feature_collection = ee.FeatureCollection(shp_asset_path)

scale = 30
maxPixels = 1e13
gcp_bucket = 'ganga-lab'
gcp_folder_name = 'naala_identification'
numPoints = 1000 # we override the number of points below with classPoints
classBand = "response"
classPoints = [4000, 1000]
classValues = [0,1]
batch_size = 500
total_features = feature_collection.size().getInfo()
total_features

6007

In [54]:
# Calculate total number of batches
total_batches = (total_features + batch_size - 1) // batch_size
total_batches

13

In [55]:
#ee_dataset_builder.main_processing_pipeline(shp_asset_path, chunk_size=50000)  # example chunk size


In [56]:
print("Total features in asset:", total_features)
print("Total batches:", total_batches)


Total features in asset: 6007
Total batches: 13


In [60]:
list_features_assets, _ = ee_dataset_builder.load_ee_asset_shapefile(
    shp_asset_path, 
    start_index=start_index, 
    end_index=end_index
)

print("Type of list_features_assets:", type(list_features_assets))
print("First few elements in list_features_assets:", list_features_assets[:2])


Type of list_features_assets: <class 'int'>
First few elements in list_features_assets: 6007


In [62]:
samples_folder_name = f'naala_mapping_gridded_150^2_classPoints_4000_1000'



# Assuming these variables are already defined:
# total_features, total_batches, batch_size, shp_asset_path, gcp_bucket, gcp_folder_name, scale, 
# geometries, isStratifiedSampling, numPoints, classValues, classBand, classPoints, samples_folder_name

print("Total features in asset:", total_features)
print("Total batches:", total_batches)

# Loop through each batch and process
for batch_number in range(total_batches):
    start_index = batch_number * batch_size
    end_index = min((batch_number + 1) * batch_size, total_features)
    print(f"Loading batch {batch_number}: start_index = {start_index}, end_index = {end_index}")

    # Load the batch of features
    nb_features, list_features_assets = ee_dataset_builder.load_ee_asset_shapefile(
        shp_asset_path, 
        start_index=start_index, 
        end_index=end_index
    )
    if list_features_assets is None:
        print("Failed to load features for batch", batch_number)
        continue
    
    # Check if list_features_assets is a list
    if isinstance(list_features_assets, list):
        ee_dataset_builder.samples_csv_export(
            shp_asset_path=shp_asset_path, 
            list_features_assets=list_features_assets,
            name_gcp_bucket=gcp_bucket, 
            folder_in_gcp_bucket=gcp_folder_name + '/' + samples_folder_name, 
            scale=scale, 
            geometries=True,
            isStratifiedSampling=True, 
            numPoints=numPoints, 
            classValues=classValues,
            classBand=classBand, 
            classPoints=classPoints,
            batch_size=batch_size,
            batch_number=batch_number,
            total_features=total_features
        )
    else:
        print("Failed to load features for batch", batch_number)


Total features in asset: 6007
Total batches: 13
Loading batch 0: start_index = 0, end_index = 500
Stratified sampling: 
numPoints: 1000, 
classBand: response, 
scale: 30, 
geometries: True, 
dropNulls: True, 
tileScale: 1, 
classPoints: [4000, 1000], 
seed: 0, 
projection:None
Batch number: 0, Start index: 0, End index: 500, Total features: [{'type': 'Feature', 'geometry': {'type': 'LineString', 'coordinates': [[81.85803238699944, 25.601521988089], [81.86947019398383, 25.60145442179845]]}, 'id': '00000000000000000000', 'properties': {'bottom': 2831834.99417558, 'id': 43, 'left': 586159.4528034591, 'right': 702828.3092464125, 'top': 2831834.99417558}}, {'type': 'Feature', 'geometry': {'type': 'LineString', 'coordinates': [[81.8580304677284, 25.601253066464743], [81.87395635609175, 25.601158744333695]]}, 'id': '00000000000000000001', 'properties': {'bottom': 2831804.99417558, 'id': 44, 'left': 586159.4528034591, 'right': 702828.3092464125, 'top': 2831804.99417558}}, {'type': 'Feature', '

EEException: Invalid GeoJSON geometry.

In [None]:
# Test loading a small batch of features
nb_features, list_features_assets = ee_dataset_builder.load_ee_asset_shapefile(
    shp_asset_path, 
    start_index=0, 
    end_index=10
)

print("Number of features loaded:", nb_features)
if list_features_assets:
    print("Features loaded successfully")
else:
    print("Failed to load features")

Number of features loaded: 6007
Features loaded successfully


In [None]:
# Example usage
q = ee_dataset_builder.chunk_raster_data("Data\geotiffs\woCond (1).tif", chunk_size=(512, 512))

AttributeError: 'EEDatasetBuilder' object has no attribute 'chunk_raster_data'