# Data extraction methodology

## Indices
* preprocess images
* calculate indices

We extract the following spectral indices for impervious surfaces, water bodies and vegetation:   
* **NDBI**, **NDBaI**
* **MNDWI**, **NDWI**
* **NDVI**, **EVI**, **SAVI**


## Population density
* Use the GHSL population dataset (`GHS_POP`) whose epoch is closest to the image acquisition date

## LST (land surface temperature)
* Use the Landsat Collection 2 surface temperature bands
  * for Landsat 8 - band 10 (`ST_B10`)
  * for Landsat 4/5/7 - band 6 (`ST_B6`)

In [None]:
# !pip install eemont
# !pip install ee
# !pip install geopandas
# !pip install rasterio
# !pip install earthengine-api
# !pip install earthengine-api --upgrade

In [None]:
import ee
import eemont
import geopandas as gpd
import rasterio
import json

In [None]:
# Authenticate against Earth Engine, then initialise the client library
# for the Cloud project that the following cells run under.
ee.Authenticate()
ee.Initialize(project="ee-dianamarkovakn")

In [None]:
# Area of interest: a single-ring polygon with four vertices.
# Every export below is clipped to it and bounded by its bounding box.
aoi = ee.Geometry.Polygon(
    [
        [
            [23.032164119466383, 42.91889685342199],
            [23.032164119466383, 42.39372184157957],
            [23.710569881185133, 42.39372184157957],
            [23.710569881185133, 42.91889685342199],
        ]
    ]
)

In [None]:
# Names of the spectral indices extracted for every scene.
spectral_indices = ['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI']

# Backward-compatible alias: the original identifier was misspelled and may
# still be referenced elsewhere, so it is kept pointing at the same list.
spectral_incides = spectral_indices

In [None]:
# Band-name lookup tables, keyed by the collection code used in the
# scene manifest ('collection' field) and in the Earth Engine asset path.
# 'LT04' is included because the methodology covers Landsat 4/5/7; it uses
# the same band names as 'LT05'.

# Surface temperature band.
ST_bands = {
    'LT04': 'ST_B6',
    'LT05': 'ST_B6',
    'LE07': 'ST_B6',
    'LC08': 'ST_B10',
}

# Red band.
Red = {
    'LT04': 'SR_B3',
    'LT05': 'SR_B3',
    'LE07': 'SR_B3',
    'LC08': 'SR_B4',
}

# Near-infrared band.
N = {
    'LT04': 'SR_B4',
    'LT05': 'SR_B4',
    'LE07': 'SR_B4',
    'LC08': 'SR_B5',
}

# Green band.
Green = {
    'LT04': 'SR_B2',
    'LT05': 'SR_B2',
    'LE07': 'SR_B2',
    'LC08': 'SR_B3',
}

# Blue band.
Blue = {
    'LT04': 'SR_B1',
    'LT05': 'SR_B1',
    'LE07': 'SR_B1',
    'LC08': 'SR_B2',
}

# First short-wave infrared band.
S1 = {
    'LT04': 'SR_B5',
    'LT05': 'SR_B5',
    'LE07': 'SR_B5',
    'LC08': 'SR_B6',
}

# Second short-wave infrared band.
S2 = {
    'LT04': 'SR_B7',
    'LT05': 'SR_B7',
    'LE07': 'SR_B7',
    'LC08': 'SR_B7',
}


In [None]:
# Sample scene record showing the manifest schema. The next cell rebinds
# `id_list` to the contents of id_list.json.
#   id         - scene identifier; its last '_'-separated field is YYYYMMDD
#   collection - key into the band lookup tables
#   season     - label carried along with the scene
#   tier       - tier component of the Earth Engine asset path
#   ghs        - asset id of the population image paired with the scene
id_list = [
    {
        'id': 'LC08_184030_20191015',
        'collection': 'LC08',
        'season': 'autumn',
        'tier': 'T1',
        'ghs': 'JRC/GHSL/P2023A/GHS_POP/2020',
    },
]

In [None]:
from collections import defaultdict
from datetime import datetime

# Load the scene manifest; this rebinds `id_list`.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('id_list.json', 'r', encoding='utf-8') as f:
    id_list = json.load(f)

# collection code -> tier -> acquisition dates, in manifest order.
grouped = defaultdict(lambda: defaultdict(list))
for item in id_list:
    # The acquisition date is the last '_'-separated field of the scene id.
    date_str = item['id'].split('_')[-1]
    date = datetime.strptime(date_str, "%Y%m%d")
    grouped[item['collection']][item['tier']].append(date)

# Display the results
for collection, tiers in grouped.items():
    print(f"Collection: {collection}")
    for tier, dates in tiers.items():
        print(f"  Tier: {tier}")
        for date in dates:
            print(f"    {date.strftime('%Y-%m-%d')}")



Collection: LE07
  Tier: T1
    1999-09-30
    2001-09-19
    2000-05-27
    2000-06-12
    2000-06-28
Collection: LT05
  Tier: T1
    1999-04-15
    2006-10-27
    2007-09-28
    2006-05-04
    2006-05-20
    2007-06-24
    2007-07-26
    2005-02-10
    2011-09-23
    2009-05-12
    2011-08-22
    2011-02-11
  Tier: T2
    2000-01-28
Collection: LC08
  Tier: T1
    2014-10-01
    2014-03-23
    2014-08-14
    2017-09-07
    2019-10-15
    2017-05-18
    2018-04-03
    2017-07-05
    2019-08-12
    2019-02-17


In [None]:
# Look up eemont's index catalogue and show the SAVI formula it defines;
# as the last expression of the cell, the formula string is displayed.
indices = eemont.indices()
indices.SAVI.formula


'(1.0 + L) * (N - R) / (N + R + L)'

In [None]:
# Names of the spectral-index bands, in export order.
_INDEX_NAMES = ['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI']
# Complete band list of every exported image.
_EXPORT_BANDS = _INDEX_NAMES + ['population_count', 'LST']


def _normalized_difference(first, second, name):
    """Return ``(first - second) / (first + second)`` as a band called *name*."""
    return first.subtract(second).divide(first.add(second)).rename(name)


def _spectral_indices(raw, scaled, collection):
    """Build the seven spectral-index bands for one scene.

    Args:
        raw: the cloud-masked scene with its stored (unscaled) band values.
        scaled: the same scene after eemont ``preprocess()``.
        collection: key into the band lookup tables (e.g. ``'LC08'``).

    Returns:
        A list of single-band images ordered like ``_INDEX_NAMES``.
    """
    green = scaled.select(Green[collection])
    blue = scaled.select(Blue[collection])
    red = scaled.select(Red[collection])
    nir = scaled.select(N[collection])
    swir1 = scaled.select(S1[collection])

    ndbi = _normalized_difference(swir1, nir, 'NDBI')

    # NOTE(review): NDBaI is intentionally still computed from the stored
    # band values. After preprocessing, SWIR1 and the thermal band are
    # presumably in different units (the thermal band is treated as Kelvin
    # when LST is derived), which would flatten this ratio. Confirm which
    # NDBaI definition is intended.
    ndbai = _normalized_difference(
        raw.select(S1[collection]), raw.select(ST_bands[collection]), 'NDBaI'
    )

    mndwi = _normalized_difference(green, swir1, 'MNDWI')
    ndwi = _normalized_difference(green, nir, 'NDWI')
    ndvi = _normalized_difference(nir, red, 'NDVI')

    # EVI = 2.5 * (N - R) / (N + 6 R - 7.5 B + 1)
    # The additive 1 only makes sense on scaled values.
    evi_denominator = nir.add(red.multiply(6)).subtract(blue.multiply(7.5)).add(1)
    evi = nir.subtract(red).multiply(2.5).divide(evi_denominator).rename('EVI')

    # SAVI = (1 + L) * (N - R) / (N + R + L) with L = 0.5
    savi = (
        nir.subtract(red)
        .divide(nir.add(red).add(0.5))
        .multiply(1.5)
        .rename('SAVI')
    )

    return [ndbi, ndbai, mndwi, ndwi, ndvi, evi, savi]


def collect_indices(items=None, *, folder='data-indices-again-again', scale=120,
                    report_min_max=False):
    """Start one Drive export per scene with indices, population and LST.

    For every manifest entry the scene is loaded, cloud-masked, resampled
    bilinearly and preprocessed with eemont. Seven spectral indices, the
    paired population image and the land surface temperature (surface
    temperature band minus 273.15) are stacked, cast to float and exported
    over the bounding box of ``aoi``.

    The reflectance-based indices are computed from the preprocessed image
    rather than from the stored band values, because EVI and SAVI contain
    additive constants that are only meaningful on scaled values.

    Args:
        items: iterable of scene records with the keys ``id``,
            ``collection``, ``tier`` and ``ghs``. Defaults to the
            module-level ``id_list``.
        folder: Drive folder that receives the exports.
        scale: pixel size passed to the export (and to the optional
            statistics).
        report_min_max: when true, fetch and print the min/max of every
            index over ``aoi``. This blocks on one server round trip per
            scene, so it is off by default.

    Returns:
        None. Progress is printed and export tasks are started as a side
        effect.
    """
    if items is None:
        items = id_list

    for item in items:
        collection = item['collection']
        # The acquisition date is the last '_'-separated field of the id.
        date = datetime.strptime(item['id'].split('_')[-1], "%Y%m%d")
        image_loc = f"LANDSAT/{collection}/C02/{item['tier']}_L2/{item['id']}"
        print(f"image loc: {image_loc}, {date.strftime('%Y-%m-%d')}")

        sat = ee.Image(image_loc).maskClouds().resample('bilinear')
        # Preprocess once and reuse it for the indices and for LST.
        scaled = sat.preprocess()
        pop = ee.Image(item['ghs']).resample('bilinear').clip(aoi).toFloat()

        image = scaled.clip(aoi)
        for band in _spectral_indices(sat, scaled, collection):
            image = image.addBands(band)

        # Collect population
        image = image.addBands(srcImg=pop)

        # Collect LST
        lst = scaled.select(ST_bands[collection]).subtract(273.15).rename('LST')
        image = image.addBands(lst)

        # Keep only the output bands, all as 32-bit floats.
        image = image.select(_EXPORT_BANDS).toFloat()

        if report_min_max:
            # A single reduction over all index bands; the minMax reducer
            # names its outputs '<band>_min' and '<band>_max'.
            stats = image.select(_INDEX_NAMES).reduceRegion(
                reducer=ee.Reducer.minMax(),
                geometry=aoi,
                scale=scale,
            ).getInfo()
            for index in _INDEX_NAMES:
                print(f"  Index: {index}, Min: {stats.get(f'{index}_min')}, "
                      f"Max: {stats.get(f'{index}_max')}")

        # The scene id is the last component of the asset path, so it is
        # used directly instead of being fetched back from the server.
        name = item['id']
        print(f"name: {name}")

        # Now we export
        task = ee.batch.Export.image.toDrive(
            image=image,
            description=name,
            folder=folder,
            scale=scale,
            region=aoi.bounds(),
        )
        task.start()

In [None]:
# Start the export tasks for every scene in `id_list`.
collect_indices()

image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_19990930, 1999-09-30
name: LE07_184030_19990930
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20010919, 2001-09-19
name: LE07_184030_20010919
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_19990415, 1999-04-15
name: LT05_184030_19990415
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000527, 2000-05-27
name: LE07_184030_20000527
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000612, 2000-06-12
name: LE07_184030_20000612
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000628, 2000-06-28
name: LE07_184030_20000628
image loc: LANDSAT/LT05/C02/T2_L2/LT05_184030_20000128, 2000-01-28
name: LT05_184030_20000128
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_20061027, 2006-10-27
name: LT05_184030_20061027
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_20070928, 2007-09-28
name: LT05_184030_20070928
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_20060504, 2006-05-04
name: LT05_184030_20060504
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_20060520, 2006

In [None]:
# Print the status dictionary of the first 30 tasks returned by the
# Earth Engine task list.
tasks = ee.batch.Task.list()
for task in tasks[:30]:

    print(task.status())

{'state': 'READY', 'description': 'LC08_184030_20190217', 'priority': 100, 'creation_timestamp_ms': 1728479260297, 'update_timestamp_ms': 1728479293854, 'start_timestamp_ms': 0, 'task_type': 'EXPORT_IMAGE', 'id': 'VQ64AXT75IJCJQE6VDOARYWE', 'name': 'projects/ee-dianamarkovakn/operations/VQ64AXT75IJCJQE6VDOARYWE'}
{'state': 'READY', 'description': 'LC08_184030_20190812', 'priority': 100, 'creation_timestamp_ms': 1728479257132, 'update_timestamp_ms': 1728479277558, 'start_timestamp_ms': 0, 'task_type': 'EXPORT_IMAGE', 'id': 'WRGSNRGJ5JXOYDYPPAESUHOC', 'name': 'projects/ee-dianamarkovakn/operations/WRGSNRGJ5JXOYDYPPAESUHOC'}
{'state': 'READY', 'description': 'LC08_184030_20170705', 'priority': 100, 'creation_timestamp_ms': 1728479253902, 'update_timestamp_ms': 1728479269080, 'start_timestamp_ms': 0, 'task_type': 'EXPORT_IMAGE', 'id': 'CJSR44JQ767D6KUAERHB3WEJ', 'name': 'projects/ee-dianamarkovakn/operations/CJSR44JQ767D6KUAERHB3WEJ'}
{'state': 'READY', 'description': 'LC08_184030_20180403

In [None]:
def collect_pop():
    """Start one Drive export per GHSL population image, clipped to ``aoi``.

    Walks the years 1975, 1980, ..., 2020, loads
    ``JRC/GHSL/P2023A/GHS_POP/<year>``, clips it to ``aoi`` and exports it
    at a scale of 120 over the bounding box of ``aoi``. No explicit
    resampling method is set here. The id reported by Earth Engine is
    printed and used as the task description.
    """
    for epoch in range(1975, 2025, 5):
        clipped = ee.Image(f"JRC/GHSL/P2023A/GHS_POP/{epoch}").clip(aoi)

        # Blocking round trip: fetch the image id from the server.
        name = clipped.id().getInfo()
        print(f"name: {name}")

        export_task = ee.batch.Export.image.toDrive(
            image=clipped,
            description=name,
            folder='population-ghs-aoi-clipped',
            scale=120,
            region=aoi.bounds(),
        )
        export_task.start()


In [None]:
# Start the population export tasks.
collect_pop()

name: 2020


In [None]:
# tasks = ee.batch.Task.list()
# for task in tasks[:30]:
#     print(task.status())