# Data extraction methodology

## Indices
* Preprocess the Landsat images
* Calculate the spectral indices

We extract the following spectral indices, grouped by the surface type they highlight:
* Impervious surfaces: **NDBI**, **NDBaI**
* Water bodies: **MNDWI**, **NDWI**
* Vegetation: **NDVI**, **EVI**, **SAVI**


## Population density
* Use the GHSL population dataset whose epoch is closest to the image acquisition date

## LST (land surface temperature)
* Use the Landsat Collection 2 surface temperature bands
  * for Landsat 8 - band 10 (`ST_B10`)
  * for Landsat 4/5/7 - band 6 (`ST_B6`)

In [18]:
# !pip install eemont
# !pip install ee
# !pip install geopandas
# !pip install rasterio
# !pip install earthengine-api
# !pip install earthengine-api --upgrade

In [153]:
import ee
import eemont
import geopandas as gpd
import rasterio
import json

In [20]:
# Authenticate this session with Earth Engine, then initialise the client
# library against the Cloud project used for this notebook.
ee.Authenticate()
ee.Initialize(project="ee-dianamarkovakn")

In [21]:
# Rectangular area of interest used to clip every image and to bound the
# exports. Vertices are [longitude, latitude] pairs forming a single ring.
aoi = ee.Geometry.Polygon(
    [
        [
            [23.032164119466383, 42.91889685342199],
            [23.032164119466383, 42.39372184157957],
            [23.710569881185133, 42.39372184157957],
            [23.710569881185133, 42.91889685342199],
        ]
    ]
)

In [74]:
# Spectral indices computed for every scene, in output band order.
spectral_incides = [
    'NDBI',
    'NDBaI',
    'MNDWI',
    'NDWI',
    'NDVI',
    'EVI',
    'SAVI',
]

In [128]:
# Surface-temperature band name for each Landsat collection prefix.
ST_bands = {
    'LE07': 'ST_B6',
    'LT05': 'ST_B6',
    'LC08': 'ST_B10',
}

In [152]:
from collections import defaultdict
from datetime import datetime

# Load the scene metadata; every entry is read below through its
# 'id', 'collection' and 'tier' keys.
with open('id_list.json', 'r') as fh:
    id_list = json.load(fh)

# collection -> tier -> acquisition dates, kept in file order.
grouped = defaultdict(lambda: defaultdict(list))
for item in id_list:
    # The scene ID ends with the acquisition date as YYYYMMDD.
    date = datetime.strptime(item['id'].rsplit('_', 1)[-1], "%Y%m%d")
    tiers = grouped[item['collection']]
    tiers[item['tier']].append(date)

# Print the grouping as an indented collection / tier / date tree.
for collection, tiers in grouped.items():
    print(f"Collection: {collection}")
    for tier, dates in tiers.items():
        print(f"  Tier: {tier}")
        for date in dates:
            print(f"    {date.strftime('%Y-%m-%d')}")



Collection: LE07
  Tier: T1
    1999-09-30
    2001-09-19
    2000-05-27
    2000-06-12
    2000-06-28
Collection: LT05
  Tier: T1
    1999-04-15
    2006-10-27
    2007-09-28
    2006-05-04
    2006-05-20
    2007-06-24
    2007-07-26
    2005-02-10
    2011-09-23
    2009-05-12
    2011-08-22
    2011-02-11
  Tier: T2
    2000-01-28
Collection: LC08
  Tier: T1
    2014-10-01
    2014-03-23
    2014-08-14
    2017-09-07
    2019-10-15
    2017-05-18
    2018-04-03
    2017-07-05
    2019-08-12
    2019-02-17


In [154]:
# Dump the raw scene metadata loaded from id_list.json for inspection.
print(id_list)

[{'id': 'LE07_184030_19990930', 'collection': 'LE07', 'season': 'winter', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LE07_184030_20010919', 'collection': 'LE07', 'season': 'autumn', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LT05_184030_19990415', 'collection': 'LT05', 'season': 'spring', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LE07_184030_20000527', 'collection': 'LE07', 'season': 'spring', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LE07_184030_20000612', 'collection': 'LE07', 'season': 'summer', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LE07_184030_20000628', 'collection': 'LE07', 'season': 'summer', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LT05_184030_20000128', 'collection': 'LT05', 'season': 'winter', 'tier': 'T2', 'ghs': 'JRC/GHSL/P2023A/GHS_POP/2000'}, {'id': 'LT05_184030_20061027', 'collection': 'LT05', 'season': 'autumn', 'tier': 'T1', 'ghs': 'JRC/GHSL/P2023A

In [144]:
def collect_indices():
    """Build and export one multi-band image for every entry in ``id_list``.

    For each entry the Landsat Collection 2 Level-2 scene is loaded and
    preprocessed once with eemont, and the exported image is assembled from:

    * the spectral indices named in ``spectral_incides``, clipped to ``aoi``;
    * the population image referenced by ``item['ghs']``, clipped to ``aoi``;
    * the sensor's surface-temperature band (looked up in ``ST_bands``),
      with 273.15 subtracted and renamed to ``LST``.

    The combined image is submitted as a Google Drive export task named after
    the scene ID. The function reads the module-level names ``id_list``,
    ``aoi``, ``spectral_incides`` and ``ST_bands``. It returns nothing:
    progress is printed and the export tasks are started as a side effect.

    Raises:
        KeyError: if an entry lacks one of the keys ``id``, ``collection``,
            ``tier`` or ``ghs``, or its collection is missing from
            ``ST_bands``.
        ValueError: if the trailing field of the scene ID is not a
            ``YYYYMMDD`` date.
    """
    for item in id_list:
        scene_id = item['id']
        # The scene ID ends with the acquisition date as YYYYMMDD.
        date = datetime.strptime(scene_id.split('_')[-1], "%Y%m%d")
        image_loc = (
            f"LANDSAT/{item['collection']}/C02/{item['tier']}_L2/{scene_id}"
        )
        print(f"image loc: {image_loc}, {date.strftime('%Y-%m-%d')}")

        pop = ee.Image(item['ghs']).clip(aoi)
        # Preprocess once and reuse the result for both the indices and LST.
        sat = ee.Image(image_loc).preprocess()

        # Collect indices; selecting by the shared list keeps the exported
        # bands in sync with the indices that were actually requested.
        image = (
            sat.spectralIndices(spectral_incides)
            .clip(aoi)
            .select(spectral_incides)
        )

        # Collect population
        image = image.addBands(srcImg=pop)

        # Collect LST. Subtracting 273.15 converts Kelvin to degrees Celsius,
        # assuming preprocess() has already applied the band's scale/offset.
        lst = (
            sat.select(ST_bands[item['collection']])
            .subtract(273.15)
            .rename('LST')
        )
        image = image.addBands(lst)

        # NOTE(review): bilinear resampling is requested for every band,
        # including the population band - confirm that interpolating
        # population values is intended.
        image = image.resample('bilinear')

        # The export is named after the scene ID, which is already known
        # locally, so no server round trip is needed to fetch it.
        name = scene_id

        print(f"name: {name}")
        print(image.bandNames().getInfo())

        # Export to Drive at 120 m per pixel over the AOI's bounding box.
        task = ee.batch.Export.image.toDrive(
            image=image,
            description=name,
            folder='data-indices-population-lst',
            scale=120,
            region=aoi.bounds(),
        )
        task.start()


In [145]:
# Start one Drive export task per entry in id_list.
collect_indices()

image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_19990930, 1999-09-30
name: LE07_184030_19990930
['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI', 'population_count', 'LST']
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20010919, 2001-09-19
name: LE07_184030_20010919
['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI', 'population_count', 'LST']
image loc: LANDSAT/LT05/C02/T1_L2/LT05_184030_19990415, 1999-04-15
name: LT05_184030_19990415
['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI', 'population_count', 'LST']
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000527, 2000-05-27
name: LE07_184030_20000527
['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI', 'population_count', 'LST']
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000612, 2000-06-12
name: LE07_184030_20000612
['NDBI', 'NDBaI', 'MNDWI', 'NDWI', 'NDVI', 'EVI', 'SAVI', 'population_count', 'LST']
image loc: LANDSAT/LE07/C02/T1_L2/LE07_184030_20000628, 2000-06-28
name: LE07_184030_20000628
['NDBI', 'N

In [150]:
# Print the status dict of the first 30 entries in the Earth Engine task list.
tasks = ee.batch.Task.list()
for task in tasks[:30]:
    print(task.status())

{'state': 'COMPLETED', 'description': 'LC08_184030_20190217', 'priority': 100, 'creation_timestamp_ms': 1727958401521, 'update_timestamp_ms': 1727958618227, 'start_timestamp_ms': 1727958588848, 'task_type': 'EXPORT_IMAGE', 'destination_uris': ['https://drive.google.com/#folders/1govcJTxphqKp72gUj1XCfop48vG5bh4A'], 'attempt': 1, 'batch_eecu_usage_seconds': 5.7516770362854, 'id': 'JAUKHTEWJUCE3F2Y5CYOUBOY', 'name': 'projects/ee-dianamarkovakn/operations/JAUKHTEWJUCE3F2Y5CYOUBOY'}
{'state': 'COMPLETED', 'description': 'LC08_184030_20190812', 'priority': 100, 'creation_timestamp_ms': 1727958400351, 'update_timestamp_ms': 1727958593620, 'start_timestamp_ms': 1727958574734, 'task_type': 'EXPORT_IMAGE', 'destination_uris': ['https://drive.google.com/#folders/1govcJTxphqKp72gUj1XCfop48vG5bh4A'], 'attempt': 1, 'batch_eecu_usage_seconds': 4.034004211425781, 'id': 'WUABH37AFWXK5IEQ3NTT5CDE', 'name': 'projects/ee-dianamarkovakn/operations/WUABH37AFWXK5IEQ3NTT5CDE'}
{'state': 'COMPLETED', 'descript