<a href="https://colab.research.google.com/github/LasiJaya24/RS_team_collaboration24/blob/main/MNDWI_Timeseries_Batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import ee
import geopandas as gpd
import math
from pathlib import Path
import geopandas as gpd
import json
import pandas as pd
import numpy as np
import os
import ee
import datetime
import sys
from google.colab import drive

In [None]:
### Feature collection and output folder
features = 'projects/remote-sensing-420704/assets/Waterbodies_qld_constructed'
output = '/remote-sensing-420704/assets/output'
drive =  '/remote-sensing-420704'

### Date range
# Start of analysis period, YYYY-MM-DD
sDate = "2024-01-01"
# End of analysis period, YYYY-MM-DD
eDate = "2024-01-31"

### Other
# The Field with the identifier
uniqueid_field = "pfi"
# The most appropriate UTM zone for spatial analysis
utmZoneInfo = "EPSG:28356"#z56
#utmZoneInfo = "EPSG:28355"#z55
#utmZoneInfo = "EPSG:28354"#z54
#Index threshold Values for area (greater than or equal to)
mdwiThresh = 0.0
# Cloud filter percentage eg. 60%
CLOUD_FILTER = 60
# Other variables
CLD_PRB_THRESH = 50
NIR_DRK_THRESH = 0.15
CLD_PRJ_DIST = 1
BUFFER = 50
BUFFERSMALL = 5
# Create variables for field names
dateString = "date"
totAreaString = "total_area"
mdwiAreaString = "mdwi_area"
mdwiAvgString = "mdwi_avg"
cloudAreaString = "cloud_area"
mdwiPercString = "mdwi_perc"
cloudPercString = "cloud_perc"

In [None]:
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library.
ee.Initialize(project=drive)

In [None]:
# Mount the google drive
drive.mount('/remote-sensing-420704')

Drive already mounted at /remote-sensing-420704; to attempt to forcibly remount, call drive.mount("/remote-sensing-420704", force_remount=True).


In [None]:
# Function to calculate MNDWI.
def calculate_mndwi(image):
    return image.normalizedDifference(['green', 'swir1']).rename('MNDWI')

# Function to iterate over the ImageCollection for a single feature and reduce it to a list of MNDWI values.
def feature_mndwi_timeseries(feature):
    def reduce_image(image):
        reduction = image.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=feature.geometry(),
            scale=30 # Adjust scale if necessary for your application
        )
        return ee.Feature(None, {
            'MNDWI': reduction.get('MNDWI'),
            'date': image.date().format()
        })
    timeseries = mndwi_collection.map(reduce_image)
    return timeseries.flatten()

# Function to export the results to Google Drive.
def export_to_drive(feature_collection, batch_index):
    task = ee.batch.Export.table.toDrive(
        collection=feature_collection,
        description=f'MNDWI_Timeseries_Batch_{batch_index}',
        folder='GEE_MNDWI_Timeseries',
        fileNamePrefix=f'mndwi_timeseries_batch_{batch_index}',
        fileFormat='CSV'
    )
    task.start()

def get_s2_sr_cld_col(aoi, start_date, end_date):
    # Import and filter S2 SR.
    s2_sr_col = (ee.ImageCollection('COPERNICUS/S2_SR')
        .filterBounds(aoi.geometry())
        .filterDate(start_date, end_date))
        #.filter(ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER)))

    # Import and filter s2cloudless.
    s2_cloudless_col = (ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
        .filterBounds(aoi.geometry())
        .filterDate(start_date, end_date))

    # Join the filtered s2cloudless collection to the SR collection by the 'system:index' property.
    return ee.ImageCollection(ee.Join.saveFirst('s2cloudless').apply(**{
        'primary': s2_sr_col,
        'secondary': s2_cloudless_col,
        'condition': ee.Filter.equals(**{
            'leftField': 'system:index',
            'rightField': 'system:index'
        })
    }))

def add_required_bands(img):
    mndwi = img.normalizedDifference(['B3','B12']).rename('mndwi')

    #Add our bands to the original collection
    return img.addBands(ee.Image(mndwi))

def add_cloud_bands(img):
    # Get s2cloudless image, subset the probability band.
    cld_prb = ee.Image(img.get('s2cloudless')).select('probability')

    # Condition s2cloudless by the probability threshold value.
    is_cloud = cld_prb.gt(CLD_PRB_THRESH).rename('clouds')

    # Add the cloud probability layer and cloud mask as image bands.
    return img.addBands(ee.Image([cld_prb, is_cloud]))

def add_shadow_bands(img):
    # Identify water pixels from the SCL band.
    not_water = img.select('SCL').neq(6)

    # Identify dark NIR pixels that are not water (potential cloud shadow pixels).
    SR_BAND_SCALE = 1e4
    dark_pixels = img.select('B8').lt(NIR_DRK_THRESH*SR_BAND_SCALE).multiply(not_water).rename('dark_pixels')

    # Determine the direction to project cloud shadow from clouds (assumes UTM projection).
    shadow_azimuth = ee.Number(90).subtract(ee.Number(img.get('MEAN_SOLAR_AZIMUTH_ANGLE')));

    # Project shadows from clouds for the distance specified by the CLD_PRJ_DIST input.
    cld_proj = (img.select('clouds').directionalDistanceTransform(shadow_azimuth, CLD_PRJ_DIST*10)
        .reproject(**{'crs': img.select(0).projection(), 'scale': 100})
        .select('distance')
        .mask()
        .rename('cloud_transform'))

    # Identify the intersection of dark pixels with cloud shadow projection.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')

    # Add dark pixels, cloud projection, and identified shadows as image bands.
    return img.addBands(ee.Image([dark_pixels, cld_proj, shadows]))

def add_cld_shdw_mask(img):
    # Add the required bands to the image
    addReqBands = add_required_bands(img)

    # Add cloud component bands.
    img_cloud = add_cloud_bands(addReqBands)

    # Add cloud shadow component bands.
    img_cloud_shadow = add_shadow_bands(img_cloud)

    # Combine cloud and shadow mask, set cloud and shadow as value 1, else 0.
    is_cld_shdw = img_cloud_shadow.select('clouds').add(img_cloud_shadow.select('shadows')).gt(0).rename('cloudandshadow')

    #Rob thinks this is cutting out too much cloud....
    # Remove small cloud-shadow patches and dilate remaining pixels by BUFFER input.
    # 20 m scale is for speed, and assumes clouds don't require 10 m precision.
    is_cld_shdw_FB = (is_cld_shdw.focalMin(2).focalMax(BUFFER*2/20)
        .reproject(**{'crs': img.select([0]).projection(), 'scale': 20}))

    is_cld_shdw_Buff = (is_cld_shdw.focalMax(BUFFERSMALL*2/20)
        .reproject(**{'crs': img.select([0]).projection(), 'scale': 20}))

    is_cld_shdw_FB = is_cld_shdw_FB.rename('cloudandshadowfiltbuff')
    is_cld_shdw_Buff = is_cld_shdw_Buff.rename('cloudandshadowbuff')

    # Add the final cloud-shadow mask to the image.
    return img_cloud_shadow.addBands(ee.Image([is_cld_shdw, is_cld_shdw_FB, is_cld_shdw_Buff]))

def calc_stats_from_layers(col, AOI, json_storages, utmZoneInfo, mndwiMaskVal):
    # Convert the mask value to a float
    mndwiMaskVal = float(mndwiMaskVal)

    # Mosaic the image collection.
    print("Mosaicing images")
    img = col.mosaic()

    # Subset the cloudandshowbuff layer and create a mask where values are greater than 0
    cloudandshadowbuff = img.select('cloudandshadowbuff').selfMask()

    # Create a mask layer from the MNDWI
    mndwi = img.select('mndwi')#.reproject(crs=utmZoneInfo['crs'], scale=utmZoneInfo['scale'])
    mndwiMaskLayer = mndwi.gte(mndwiMaskVal).rename('mndwiMaskLayer')

    # Perform reprojections
    print("Clipping images")
    mndwiProj = mndwi.clip(AOI)#.reproject(crs=utmZoneInfo, scale=10)#Chosen 10 metres, , crsTransform='null'
    mndwiMaskedProj = mndwiMaskLayer.clip(AOI)#.reproject(crs=utmZoneInfo, scale=10)
    cloudandshadowbuffProj = cloudandshadowbuff.clip(AOI)#.reproject(crs=utmZoneInfo, scale=10)

    #This will now give each cell in the mask area 1 * pixel area
    #Need one complete raster for total area stats calc
    print("Calculating area")
    allStatsLayer = mndwiProj.gte(-50.0).multiply(ee.Image.pixelArea()).rename('allPixels')
    mndwiStatsLayer = mndwiMaskedProj.multiply(ee.Image.pixelArea()).rename('mndwiStatsLayer')
    cloudStatsLayer = cloudandshadowbuffProj.multiply(ee.Image.pixelArea()).rename('cloudStatsLayer')

    # Create a collection of features
    #theCollection = ee.FeatureCollection(json_storages)

    # Calculate statistics using the reducer
    print("Calculating stats")
    allStats = allStatsLayer.reduceRegions(**{'reducer': ee.Reducer.sum(),'crs':utmZoneInfo, 'scale': 10,'collection': col}).getInfo()['features']
    mndwiStats = mndwiStatsLayer.reduceRegions(**{'reducer': ee.Reducer.sum(),'crs':utmZoneInfo, 'scale': 10,'collection': col}).getInfo()['features']
    cloudStats = cloudStatsLayer.reduceRegions(**{'reducer': ee.Reducer.sum(),'crs':utmZoneInfo, 'scale': 10,'collection': col}).getInfo()['features']

    #Send back a dictionary of JSON objects, these string identifiers will need to be match in the script that calls this function
    return {'allAreas':allStats,'mndwiAreas':mndwiStats,'cloudAreas':cloudStats}

def retrieveStatsFromJSON(theResultsDict, theIDField, theMetric, theDataPropertyName):
    # Create a dataframe to store results
    geeStats = pd.DataFrame(columns=[theIDField, theDataPropertyName])

    # Add each row to the dataframe
    for i in range(len(theResultsDict)):
        theID = theResultsDict[i]["properties"][theIDField]
        dataRow = [theID, -9999]

        # Apply the calculation if required eg. Sum
        if theMetric in theResultsDict[i]["properties"]:
            dataRow = [theID, theResultsDict[i]["properties"][theMetric]]

        # Add the results to the dataframe row
        geeStats.loc[len(geeStats)] = dataRow

    return geeStats

def process_batch(batch):
    # Convert the batch to a list of ee.Features.
    features = []
    for _, row in batch.iterrows():
        geom = ee.Geometry.Polygon(row['geometry'].exterior.coords)
        feature = ee.Feature(geom, row.drop('geometry').to_dict())
        features.append(feature)

    # Create a FeatureCollection from the features list.
    batch_feature_collection = ee.FeatureCollection(features)

    # Here you can add processing specific to the batch.
    # For example, printing the size of the batch.
    print('Processed batch with feature count:', batch_feature_collection.size().getInfo())

def batch_process_shapefile(gdf, batch_size):
    for index in range(0, len(gdf), batch_size):
        batch = gdf.iloc[index:index + batch_size]
        process_batch(batch)

In [None]:

# For a FeatureCollection (like a shapefile or table)
all_features = ee.FeatureCollection(features)

In [None]:
size = all_features.size().getInfo()
print(size)

21839


In [None]:
# Convert the dates to the correct format
overallStartDate = datetime.datetime.strptime(sDate,"%Y-%m-%d")
overallEndDate = datetime.datetime.strptime(eDate,"%Y-%m-%d")

# Count the number of days to process
delta = overallEndDate - overallStartDate
theDayCount = delta.days + 1
print("Total days:", theDayCount)

Total days: 31


In [None]:
basin = 'Burdekin'
query = 'drainage_b == "{}"'.format(basin)
print(query)

fc_server = all_features.filter(query)

size = fc_server.size().getInfo()
print(size)


# This will give bounds in expected GEE format
geeFeatureGeometry = ee.Geometry(fc_server.geometry())
AOI = geeFeatureGeometry.bounds()

drainage_b == "Burdekin"
2332


In [None]:
# Create an empty dataframe for results
tsStats = pd.DataFrame(columns=[uniqueid_field, dateString, totAreaString, mdwiAreaString, cloudAreaString, mdwiPercString, cloudPercString])

# Get current time
beginNow = datetime.datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
print("Starting at " + beginNow)
print("")

# For each day
for i in range(theDayCount):

    # Get the date range
    theStartDate = overallStartDate + datetime.timedelta(days=i)
    theStart = datetime.datetime.strftime(theStartDate, "%Y-%m-%d")
    theEndDate = overallStartDate + datetime.timedelta(days=i+1)
    theEnd = datetime.datetime.strftime(theEndDate, "%Y-%m-%d")

    # Create a dataframe to store daily results
    dfCombo = pd.DataFrame()

    # Print the current time
    hereNow = datetime.datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
    print("Processing period: " + str(theStart) + " at " + hereNow)

    # Create an image collection
    theImgColl = get_s2_sr_cld_col(fc_server, theStart, theEnd)
    print(int(theImgColl.size().getInfo()))
    numElements = theImgColl.size().getInfo()

    # If images are found for that day
    if numElements > 0:

        # Print the number of images found
        print("Images found: " + str(numElements))

        # Add cloud shadow mask
        print("Adding shadow mask")
        theImgCollExpanded = theImgColl.map(add_cld_shdw_mask)

        # Calculate stats from layers
        print("Calculating stats")
        statsDict = calc_stats_from_layers(theImgCollExpanded, geeFeatureGeometry, fc_server, utmZoneInfo, mdwiThresh)

        # Retrieve stats for allAreas
        print("Retrieving stats")
        dfAll = retrieveStatsFromJSON(statsDict["allAreas"], uniqueid_field, "sum", totAreaString)

        # Add date column
        dfAll[dateString] = theStart

        # Retrieve stats for MDWI and Clouds
        dfMDWI = retrieveStatsFromJSON(statsDict["mndwiAreas"], uniqueid_field, "sum", mdwiAreaString)
        dfCloud = retrieveStatsFromJSON(statsDict["cloudAreas"], uniqueid_field, "sum", cloudAreaString)

        # Merge all of the DF's together
        dfCombo = pd.merge(dfAll, dfMDWI, how="left", left_on=[uniqueid_field], right_on=[uniqueid_field])
        dfCombo = pd.merge(dfCombo, dfCloud, how="left", left_on=[uniqueid_field], right_on=[uniqueid_field])

        # Add the percentage fields
        dfCombo[mdwiPercString] = dfCombo.apply(lambda row: (row[mdwiAreaString] / row[totAreaString]) * 100 if row[totAreaString] != 0 else 0, axis=1)
        dfCombo[cloudPercString] = dfCombo.apply(lambda row: (row[cloudAreaString] / row[totAreaString]) * 100 if row[totAreaString] != 0 else 0, axis=1)

        # Add stats to the results dataframe
        if len(dfCombo) > 0:
            tsStats = pd.concat([tsStats, dfCombo])

        # Print the finish time for the day
        hereNow = datetime.datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
        print("Done processing day: " + str(theStart) + " at " + hereNow)
        print("")

    else:
        # Print the finish time for the day
        print("No images found")
        print("")

# Save total results to CSV
file_path = folder_path + basin
tsStats.to_csv(file_path, index=False)

# Calculate time taken for completion
rightNow = datetime.datetime.now().strftime("%d/%m/%Y, %H:%M:%S")
print("Entire period processed, started at " + beginNow + " and finished at " + rightNow)

Starting at 04/06/2024, 06:04:07

Processing period: 2024-01-01 at 04/06/2024, 06:04:07


TypeError: can only concatenate str (not "module") to str