In [1]:
import rasterio
import pandas as pd
from pathlib import Path

In [2]:
from osgeo import gdal
import datacube
import boto3

# Create the Datacube handle that the dataset queries in this notebook call.
dc = datacube.Datacube(app="04_Loading_data")

In [154]:
# Query the final-maturity LS7 FC datasets acquired in 2021-2022.

query = {
    'time': ("2021-01-01", "2022-12-31"),
    'dataset_maturity': 'final',
}

ls7_fc_datasets = dc.find_datasets(product='ga_ls_fc_3', platform='landsat-7', **query)

# Parallel lists: each dataset's id as a string, and its first URI.
ls7_fc_ids = [str(fc_ds.id) for fc_ds in ls7_fc_datasets]
ls7_fc_paths = [str(fc_ds.uris[0]) for fc_ds in ls7_fc_datasets]

In [95]:
# Query the final-maturity LS9 and LS7 ARD datasets acquired in 2021-2022.

query = {
    'time': ("2021-01-01", "2022-12-31"),
    'dataset_maturity': 'final',
}

ls9_ard_datasets = dc.find_datasets(product='ga_ls9c_ard_3', **query)
ls7_ard_datasets = dc.find_datasets(product='ga_ls7e_ard_3', **query)

# Per the original author's note: these are good scenes where gqa < 1 for LS9.
wof_ls9_overlapping_scene = pd.read_csv("wo_ls9_overwrite_ls7.csv")

In [96]:
# Number of LS9 and LS7 ARD datasets returned by the two queries.
len(ls9_ard_datasets), len(ls7_ard_datasets)

(10876, 9909)

In [97]:
# Reduce every ARD URI to its parent directory relative to the product root,
# so LS9 and LS7 scenes can be compared through that relative path.
ls9_path = [
    str(Path(ard_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls9c_ard_3/', '')).parent)
    for ard_ds in ls9_ard_datasets
]
ls9_ids = [str(ard_ds.id) for ard_ds in ls9_ard_datasets]
ls7_path = [
    str(Path(ard_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls7e_ard_3/', '')).parent)
    for ard_ds in ls7_ard_datasets
]

In [104]:
# Split the LS9 relative paths into those that also occur for LS7 (common)
# and those that occur for LS9 only (non-overlapping).
ls7_common_ls9_cmn = list(set(ls9_path) & set(ls7_path))
ls9_non_overlapping = list(set(ls9_path).difference(set(ls7_path)))
len(ls9_non_overlapping), len(ls7_common_ls9_cmn)
# (9655, 1221)

(9655, 1221)

In [134]:
# Collect the ids of the LS9 ARD datasets whose relative path has no LS7 counterpart.
# `ls9_non_overlapping` is a list, so testing `in` on it directly rescans the
# whole list for every dataset; convert it to a set once for hash lookups.
non_overlapping_lookup = set(ls9_non_overlapping)

ls9_non_overlapping_ids = [
    str(ard_ds.id)
    for ard_ds in ls9_ard_datasets
    if str(Path(ard_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls9c_ard_3/', '')).parent)
    in non_overlapping_lookup
]

In [None]:
# Find the LS9 ids overlapping LS7 where gqa < 1, based on the WOfS dataset ids

In [107]:
# Now extract the LS9 ids for the overlapping scenes.
# `ls7_common_ls9_cmn` is a list; build a set from it once so that each
# membership test below is a hash lookup instead of a scan of the whole list.
common_prefixes = set(ls7_common_ls9_cmn)

ls9_common_ids = []
ls9_paths = []

for ls9_ds in ls9_ard_datasets:
    ls9_pth = str(Path(ls9_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls9c_ard_3/', '')).parent)

    if ls9_pth in common_prefixes:
        ls9_common_ids.append(str(ls9_ds.id))
        ls9_paths.append(ls9_pth)

# One row per overlapping LS9 dataset: its id and the relative path ("prefix").
ls9_cmn = pd.DataFrame({"ls9_ids": ls9_common_ids, "prefix": ls9_paths})

In [108]:
# Now extract the LS7 ids for the overlapping scenes.
# `ls7_common_ls9_cmn` is a list; build a set from it once so that each
# membership test below is a hash lookup instead of a scan of the whole list.
common_prefixes = set(ls7_common_ls9_cmn)

ls7_common_ids = []
ls7_paths = []

for ls7_ds in ls7_ard_datasets:
    ls7_pth = str(Path(ls7_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls7e_ard_3/', '')).parent)

    if ls7_pth in common_prefixes:
        ls7_common_ids.append(str(ls7_ds.id))
        ls7_paths.append(ls7_pth)

# One row per overlapping LS7 dataset: its id and the relative path ("prefix").
ls7_cmn = pd.DataFrame({"ls7_ids": ls7_common_ids, "prefix": ls7_paths})

In [109]:
# LS9 ids present both in the CSV's `ls9_id` column and among the overlapping LS9 datasets.
ls9_gqa_more_than_one_to_process = list(
    set(wof_ls9_overlapping_scene["ls9_id"]) & set(ls9_common_ids)
)
# Tag each of those ids with gqa_valid=True.
ls9_valid_gqa_df = pd.DataFrame(
    {"ls9_ids": ls9_gqa_more_than_one_to_process, "gqa_valid": True}
)

In [110]:
# Number of LS9 ids flagged with gqa_valid=True.
len(ls9_valid_gqa_df)

1153

In [111]:
# Pair each overlapping LS9 dataset with the LS7 dataset(s) sharing its prefix.
df_merged = pd.merge(ls9_cmn, ls7_cmn, on='prefix', how='left')

In [112]:
# Attach the `gqa_valid` flag to each LS9/LS7 pair. The right-hand frame is
# `ls9_valid_gqa_df`: it is the frame keyed by "ls9_ids" that carries the
# "gqa_valid" column. (The argument used to be a stray `å` character, i.e. an
# undefined name.) With a left join, rows without a match get NaN in `gqa_valid`.
df_merged = df_merged.merge(ls9_valid_gqa_df, on='ls9_ids', how='left')

In [113]:
# Compare row counts of the merged frame and its two inputs.
len(df_merged), len(ls9_cmn), len(ls7_cmn)

(1221, 1221, 1221)

In [114]:
# Keep only the rows whose `gqa_valid` value equals True; all other rows are dropped.
df_merged = df_merged.loc[df_merged['gqa_valid'].eq(True)]

In [156]:
# Preview the first two rows of the filtered LS9/LS7 pairs.
df_merged.head(2)

Unnamed: 0,ls9_ids,prefix,ls7_ids,gqa_valid
0,58a5f919-1e49-4e8e-90ba-ed9c0a0fe42a,090/089/2022/03/02,9d3f1fbd-994d-4e8d-b8e9-6bb4cd165b5c,True
1,58ea4512-2793-44a0-ae49-08ce7ea36892,091/084/2022/02/05,712dcccf-5668-425e-8e23-24d04e1b156f,True


In [139]:
# Write the full list of LS9 ARD datasets to process 

In [136]:
# LS9 ids to process: the ids kept in df_merged followed by the non-overlapping LS9 ids.
ls9_ids_to_process = [*df_merged['ls9_ids'], *ls9_non_overlapping_ids]

In [137]:
# Write the LS9 ids to a text file as a single space-separated line.
ls9_ard_ids_21_22 = ' '.join(ls9_ids_to_process)
with open("ls9_ard_ids_21_22.txt", mode="w") as file:
    file.write(ls9_ard_ids_21_22)

In [142]:
# compile the list to archive
path = []  # NOTE(review): not read by any cell visible here; kept in case another cell uses it

# `ls7_fc_scenes` is not defined in any cell visible in this notebook; the
# LS7 FC query result is stored as `ls7_fc_datasets`, so build the table from
# that name so the cell runs on a fresh kernel.
ids = [fc_ds.id for fc_ds in ls7_fc_datasets]
ls7_fc_scene_prefix = [str(fc_ds.uris[0]) for fc_ds in ls7_fc_datasets]

# One row per LS7 FC dataset: its id and its first URI.
ls7_21_22_scenes = pd.DataFrame({"ls7_fc_id": ids, "ls_fc_prefix": ls7_fc_scene_prefix})

In [166]:
 # Display the `prefix` column of the rows kept in df_merged.
 df_merged.prefix

0       090/089/2022/03/02
1       091/084/2022/02/05
3       113/081/2022/03/20
4       090/089/2022/01/29
5       109/070/2021/10/31
               ...        
1216    110/083/2021/12/25
1217    109/070/2022/01/03
1218    107/070/2022/01/21
1219    090/085/2022/04/03
1220    109/070/2021/12/18
Name: prefix, Length: 1153, dtype: object

In [None]:
# Select the LS7 FC datasets whose relative path matches a prefix kept in `df_merged`.
ls7_fc_scenes_to_archive = []
ls7_fc_paths_to_archive = []

# Build the lookup once: `in df_merged.prefix.values` rescans the whole array
# on every loop iteration.
prefixes_to_archive = set(df_merged.prefix)

# NOTE(review): the last recorded run of this notebook reported 0 matches.
# Confirm that the indexed FC URIs really start with the prefix stripped below
# (including its version segment); if they do not, `replace` leaves the URI
# unchanged and no base path can match.
for ls7_fc_id, ls7_fc_path in zip(ls7_fc_ids, ls7_fc_paths):
    base_path = str(Path(ls7_fc_path.replace('s3://dea-public-data/derivative/ga_ls_fc_3/2-5-0/', '')).parent)
    if base_path in prefixes_to_archive:
        ls7_fc_scenes_to_archive.append(ls7_fc_id)
        # Record the matching URI too (the old commented-out append used the
        # unrelated name `path`, so this list was never filled).
        ls7_fc_paths_to_archive.append(ls7_fc_path)

len(ls7_fc_scenes_to_archive)

In [149]:
# Write the LS7 FC ids selected for archiving as a single space-separated line.
ls7_fc_scenes_to_archive_21_22 = ' '.join(ls7_fc_scenes_to_archive)
with open("ls7_fc_scenes_to_archive_21_22.txt", mode="w") as file:
    file.write(ls7_fc_scenes_to_archive_21_22)

In [164]:
# Number of LS7 FC datasets selected for archiving.
len(ls7_fc_scenes_to_archive)

0