# Create training dataset

In [1]:
import ee
try:
    ee.Initialize()
except:
    ee.Authenticate()
    ee.Initialize()

import pandas as pd
from sklearn.cluster import DBSCAN
import geemap
import numpy as np
import geopandas as gpd
from geeml.utils import eeprint
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [3]:
# Get South Africa geometry
countries = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017")
geometry = ee.Geometry.Point([24.06353794842853, -29.732969740562062])
sa = countries.filterBounds(geometry)
sa_geo = sa.geometry()

# Get Landsat data
dataset = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')\
.filterBounds(sa_geo)\
.filterDate('2018-06-28', '2023-08-22')

# Applies scaling factors.
def applyScaleFactors(image):
    opticalBands = image.select('SR_B.').multiply(0.0000275).add(-0.2)
    thermalBands = image.select('ST_B.*').multiply(0.00341802).add(149.0)
    return image.addBands(opticalBands, None, True).addBands(thermalBands, None, True)

lS8 = dataset.map(applyScaleFactors)

In [4]:
df = pd.read_csv(r'C:/Users/coach/Documents/scratch/Post_doc/Fire/data/fire_2020_08_01.csv')
df = df.dropna()
risk_dict = {7692: 'Extreme', 7805:'High', 2903:'Medium', 6555:'Low'}
df['risk'] = df['fireRisk'].astype(int).map(risk_dict)

# Combine 'year' and 'day_of_year' columns to create a new datetime column
df['date'] = pd.to_datetime(df['BurnYear'].astype(str) + df['BurnDate'].astype(str), format='%Y%j')

df.head()

Unnamed: 0,id,BurnDate,BurnYear,fireRisk,x,y,risk,date
0,224879.0,231,2020,6555.0,27.119015,-33.452137,Low,2020-08-18
1,224879.0,233,2020,6555.0,27.121262,-33.452137,Low,2020-08-20
2,224879.0,231,2020,6555.0,27.119015,-33.449894,Low,2020-08-18
3,224879.0,231,2020,6555.0,27.121262,-33.449894,Low,2020-08-18
4,224879.0,231,2020,6555.0,27.11677,-33.447647,Low,2020-08-18


In [5]:
# https://stackoverflow.com/questions/34579213/dbscan-for-clustering-of-geographic-location-data
# https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/
coords = df[['x', 'y']].values
db = DBSCAN(eps=2/6371., min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))

In [6]:
df['clusters'] = db.labels_
df['clusters'].value_counts()

311     3662
1325    2819
1244    1493
765     1437
108     1378
        ... 
1216       1
492        1
556        1
1074       1
1256       1
Name: clusters, Length: 1475, dtype: int64

In [7]:
# Group by 'group' column and calculate the required values
grouped = df.groupby('clusters').agg(
    start_date=('date', 'min'),
    end_date=('date', 'max'),
    duration=('date', lambda x: (x.max() - x.min()).days + 1)
).reset_index()

# Merge the grouped data back into the original dataframe
result_df = df.merge(grouped, on='clusters')

result_df.head()

Unnamed: 0,id,BurnDate,BurnYear,fireRisk,x,y,risk,date,clusters,start_date,end_date,duration
0,224879.0,231,2020,6555.0,27.119015,-33.452137,Low,2020-08-18,0,2020-08-13,2020-08-21,9
1,224879.0,233,2020,6555.0,27.121262,-33.452137,Low,2020-08-20,0,2020-08-13,2020-08-21,9
2,224879.0,231,2020,6555.0,27.119015,-33.449894,Low,2020-08-18,0,2020-08-13,2020-08-21,9
3,224879.0,231,2020,6555.0,27.121262,-33.449894,Low,2020-08-18,0,2020-08-13,2020-08-21,9
4,224879.0,231,2020,6555.0,27.11677,-33.447647,Low,2020-08-18,0,2020-08-13,2020-08-21,9


In [8]:
# Convert the DataFrame to a GeoDataFrame
geometry = gpd.points_from_xy(df['x'], df['y'])
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs = 'EPSG:4326')

In [10]:
cols = ['id', 'BurnDate', 'BurnYear', 'fireRisk', 'x', 'y', 'risk', 'date',
       'clusters', 'start_date', 'end_date', 'duration', 'img_id',
       'areakm2_BA', 'CO_column_number_density_p25_variance_p25',
       'CO_column_number_density_p25_variance_p5',
       'CO_column_number_density_p25_variance_p50',
       'CO_column_number_density_p25_variance_p75',
       'CO_column_number_density_p25_variance_p95',
       'CO_column_number_density_p25_variance_p99',
       'CO_column_number_density_p50_variance_p25',
       'CO_column_number_density_p50_variance_p5',
       'CO_column_number_density_p50_variance_p50',
       'CO_column_number_density_p50_variance_p75',
       'CO_column_number_density_p50_variance_p95',
       'CO_column_number_density_p50_variance_p99',
       'CO_column_number_density_p5_variance_p25',
       'CO_column_number_density_p5_variance_p5',
       'CO_column_number_density_p5_variance_p50',
       'CO_column_number_density_p5_variance_p75',
       'CO_column_number_density_p5_variance_p95',
       'CO_column_number_density_p5_variance_p99',
       'CO_column_number_density_p75_variance_p25',
       'CO_column_number_density_p75_variance_p5',
       'CO_column_number_density_p75_variance_p50',
       'CO_column_number_density_p75_variance_p75',
       'CO_column_number_density_p75_variance_p95',
       'CO_column_number_density_p75_variance_p99',
       'CO_column_number_density_p95_variance_p25',
       'CO_column_number_density_p95_variance_p5',
       'CO_column_number_density_p95_variance_p50',
       'CO_column_number_density_p95_variance_p75',
       'CO_column_number_density_p95_variance_p95',
       'CO_column_number_density_p95_variance_p99',
       'CO_column_number_density_p99_variance_p25',
       'CO_column_number_density_p99_variance_p5',
       'CO_column_number_density_p99_variance_p50',
       'CO_column_number_density_p99_variance_p75',
       'CO_column_number_density_p99_variance_p95',
       'CO_column_number_density_p99_variance_p99']

In [11]:
# Y
# For each fire duration (cluster),
# get spatio-temporally overlapping Landsat scenes,
# Get burn area extent for scene based on MODIS
# add MODIS burn area (km2) as property to landsat scene.

# Define a function to remove fires with confidence level < 50%
def conf_mask(img):
    conf = img.select('ConfidenceLevel')
    level = conf.gt(50)
    return img.updateMask(level).select('BurnDate').setDefaultProjection(**{'crs':projection, 'scale': 250})
        
outdf = pd.DataFrame(columns = cols)
for cluster in tqdm(list(result_df.clusters.unique())):
    # Get row from cluster
    row = result_df.loc[result_df['clusters'] == cluster].iloc[0]
    # Get start and end date
    startDate = ee.Date(str(row['start_date'].date()))
    endDate = ee.Date(str(row['end_date'].date()))
    # For point extract data
    x = row['x']
    y = row['y']
    point = ee.Geometry.Point([x, y])

    # Filter Landsat for fire duration
    ds = lS8.filterBounds(point).filterDate(startDate, endDate)
    # Check if landsat scene exists
    try:
        dsSize = int(ds.size().getInfo())
    except:
        dsSize = 0
    if dsSize>0:
        t = ds.map(lambda img: img.set('id', img.id()))
        row['img_id'] = t.aggregate_array('id').getInfo()[0]

        # If scene not already in dataset
        while (outdf.shape[0]==0) | (row['img_id'] not in outdf['img_id'].unique()): 
        
            # Filter and map the fire collection
            fire = ee.ImageCollection("ESA/CCI/FireCCI/5_1")
            # Set projection and scale
            projection = fire.select('BurnDate').first().projection()
            startDOY = startDate.getRelative('day', 'year')
            endDOY = endDate.getRelative('day', 'year')
            year = startDate.get('year')
            month = startDate.get('month')
            date = ee.Date.fromYMD(year, month, 1)
            fire = fire \
                .filterBounds(point)\
                .filterDate(date, date.advance(1, 'month')) \
                .map(conf_mask)\
                .map(lambda img: img.gte(startDOY).And(img.lte(endDOY)).unmask(0))\
                .max().selfMask()
        
            area = ee.Image.pixelArea().divide(1e6)
            burnAreaImage = area.multiply(fire).rename('areakm2_BA')
            burnArea = burnAreaImage.reduceRegion(reducer = ee.Reducer.sum(),
                                                  geometry = t.geometry(),
                                                  scale = 250).get("areakm2_BA")
            row['areakm2_BA'] = burnArea.getInfo()
    
            # X
            # For each landsat scene compute percentiles for x prior weeks, extract data
            
            date = ee.Image(t.first()).date()
            
            CO = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CO")\
            .filterBounds(t.geometry()).select('CO_column_number_density')
            reducer = ee.Reducer.percentile([5, 25, 50, 75, 95, 99])
            weeks = ee.List(list(range(-8, 0)))
            # Get temporal percentiles per week
            outic = weeks.map(lambda week: CO.filterDate(date.advance(week, 'week'), date.advance(ee.Number(week).add(1), 'week')).reduce(reducer).clip(t.geometry()))
            # Compute variance across weeks
            result = ee.Image(ee.ImageCollection(outic).reduce(ee.Reducer.variance())).regexpRename('^[^_]*_p', '')
            # Compute spatial percentiles for variance image
            predictors = result.reduceRegion(reducer = reducer,
                                              geometry = t.geometry(),
                                              scale = 1000)
            row = row.append(pd.Series(predictors.getInfo(), name='predictors'))
            row.name = str(row.id)
            outdf = outdf.append(row)
    # print(outdf.shape)

  0%|          | 0/1475 [00:00<?, ?it/s]

In [12]:
# v2_2019_08.csv
outdf.to_csv(r'C:\Users\coach\myfiles\postdoc\Fire\data\v2_2020_08.csv')

In [13]:
# Extract negative examples (no fires)

# get all landsat scenes for SA for year and month of loaded modis fire pixels (df)

year = df['date'][0].year
month = df['date'][0].month
date = ee.Date.fromYMD(year, month, 1)
ds = lS8.filterDate(date, date.advance(1, 'month')).filterBounds(sa_geo).map(lambda img: img.set('id', img.id()))
# get a list of scene ids
ids = ds.aggregate_array('id').getInfo()

noutdf = pd.DataFrame()
for id in tqdm(ids):
    outDict = {}
    if id not in outdf['img_id']:
        # set scene id
        outDict['img_id'] = id 
        # set response (burn area) to 0
        outDict['areakm2_BA'] = 0

        # X
        # For each landsat scene compute percentiles for x prior weeks, extract data

        # Select Landsat scene
        img = ee.Image(ds.filter(ee.Filter.eq('id', id)).first())
        # get date of image
        date = img.date()

        # prepare predictors (carbon monoxide)
        CO = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CO")\
        .filterBounds(img.geometry()).select('CO_column_number_density')
        reducer = ee.Reducer.percentile([5, 25, 50, 75, 95, 99])
        weeks = ee.List(list(range(-8, 0)))
        outic = weeks.map(lambda week: CO.filterDate(date.advance(week, 'week'), date.advance(ee.Number(week).add(1), 'week')).reduce(reducer).clip(img.geometry()))
        result = ee.Image(ee.ImageCollection(outic).reduce(ee.Reducer.variance())).regexpRename('^[^_]*_p', '')
        predictors = result.reduceRegion(reducer = reducer,
                                          geometry = img.geometry(),
                                          scale = 1000).getInfo()
        outDict.update(predictors)
        noutdf = noutdf.append(pd.Series(outDict, name='instance'))
    # print(noutdf.shape)

  0%|          | 0/154 [00:00<?, ?it/s]

In [14]:
# v2_2019_08_neg.csv
noutdf.reset_index(drop = True).to_csv(r'C:\Users\coach\myfiles\postdoc\Fire\data\v2_2020_08_neg.csv')

### Exploration: Plot expanding window of percentiles prior to landsat image

In [10]:
# X
# For each landsat scene compute percentiles for x prior weeks, extract data

date = ee.Image(t.first()).date()

CO = ee.ImageCollection("COPERNICUS/S5P/OFFL/L3_CO")\
.filterBounds(t.geometry()).select('CO_column_number_density')
reducer = ee.Reducer.percentile([5, 25, 50, 75, 95, 99])
weeks = ee.List(list(range(-8, 0)))
outic = weeks.map(lambda week: CO.filterDate(date.advance(week, 'week'), date).reduce(reducer).clip(t.geometry()))
result = ee.ImageCollection(outic).reduce(ee.Reducer.variance())
result = result.select("CO_column_number_density_p95_variance").subtract(result.select("CO_column_number_density_p5_variance"))
eeprint(result)

In [11]:
band = "CO_column_number_density_p95_variance"
min = result.select(band).reduceRegion(reducer = ee.Reducer.min(),
                                          geometry = t.geometry(),
                                          scale = 1000).get(band)
# eeprint(min)
max = result.select(band).reduceRegion(reducer = ee.Reducer.max(),
                                          geometry = t.geometry(),
                                          scale = 1000).get(band)
# eeprint(max)

In [12]:
Map = geemap.Map()
Map.centerObject(point, 8)
Map.addLayer(result, {'bands':band, 'min': min, 'max': max})
Map.addLayer(point, {'color': 'red'})
Map

Map(center=[-33.83616638183594, 24.26237297058105], controls=(WidgetControl(options=['position', 'transparent_…

## Wishlist
- Merge fires across months
- Try maxent
- Try larger dataset
- Calculate the lag upload for a dataset