In [1]:
import rasterio
import pandas as pd
from pathlib import Path

In [2]:
from osgeo import gdal
import datacube
import boto3

# Datacube handle used by the `find_datasets` queries in the cells below.
dc = datacube.Datacube(app="04_Loading_data")

In [95]:
# Find the LS9 and LS7 ARD datasets acquired in 2021-2022.
# Both lookups share the same time window and only consider 'final' datasets.
query = {
    'time': ("2021-01-01", "2022-12-31"),
    'dataset_maturity': 'final',
}

ls9_ard_datasets = dc.find_datasets(product='ga_ls9c_ard_3', **query)
ls7_ard_datasets = dc.find_datasets(product='ga_ls7e_ard_3', **query)

# Per the original author's note: these are good scenes where gqa < 1 for ls9.
wof_ls9_overlapping_scene = pd.read_csv("wo_ls9_overwrite_ls7.csv")

In [96]:
# How many datasets each query returned: (LS9, LS7).
tuple(map(len, (ls9_ard_datasets, ls7_ard_datasets)))

(10876, 9909)

In [97]:
def _scene_prefix(dataset, product_root):
    """Return the parent folder of the dataset's first URI with `product_root` removed."""
    relative_uri = dataset.uris[0].replace(product_root, '')
    return str(Path(relative_uri).parent)


# Scene prefixes are relative to each product's S3 root so that LS9 and LS7
# entries can be compared directly as strings.
ls9_path = [_scene_prefix(ds, 's3://dea-public-data/baseline/ga_ls9c_ard_3/') for ds in ls9_ard_datasets]
ls9_ids = [str(ds.id) for ds in ls9_ard_datasets]
ls7_path = [_scene_prefix(ds, 's3://dea-public-data/baseline/ga_ls7e_ard_3/') for ds in ls7_ard_datasets]

In [98]:
# LS9 scene prefixes for which no LS7 dataset shares the same prefix.
ls9_non_overlapping_paths = list(set(ls9_path).difference(set(ls7_path)))

In [101]:
# Number of distinct LS9 prefixes that have no matching LS7 prefix.
len(ls9_non_overlapping_paths)

9655

In [104]:
# Split the LS9 prefixes into those shared with LS7 and those unique to LS9.
ls9_prefix_set, ls7_prefix_set = set(ls9_path), set(ls7_path)
ls7_common_ls9_cmn = list(ls9_prefix_set & ls7_prefix_set)
ls9_non_overlapping = list(ls9_prefix_set - ls7_prefix_set)
len(ls9_non_overlapping), len(ls7_common_ls9_cmn)
# (9655, 1221)

(9655, 1221)

In [134]:
# Collect the ids of LS9 ARD datasets whose scene prefix has no LS7 counterpart.
#
# `ls9_non_overlapping` is a list, so testing membership against it directly
# rescans the list for every dataset. Build a set once so each lookup is a
# hash probe; the resulting ids and their order are unchanged.
ls9_non_overlapping_lookup = set(ls9_non_overlapping)

ls9_non_overlapping_ids = []

for x in ls9_ard_datasets:
    # Same prefix derivation used when the prefix lists were built:
    # strip the product root from the first URI and keep the parent folder.
    base_path_ls9 = str(Path(x.uris[0].replace('s3://dea-public-data/baseline/ga_ls9c_ard_3/', '')).parent)
    if base_path_ls9 in ls9_non_overlapping_lookup:
        ls9_non_overlapping_ids.append(str(x.id))

In [107]:
# Now extract the LS9 ids for the overlapping scenes.
#
# `ls7_common_ls9_cmn` is a list; look prefixes up in a set built once so the
# loop does not rescan the list for every dataset. Output rows and their order
# are the same as with the list lookup.
ls9_common_prefix_lookup = set(ls7_common_ls9_cmn)

ls9_common_ids = []
ls9_paths = []

for ls9_ds in ls9_ard_datasets:
    # Prefix = parent folder of the first URI, relative to the LS9 product root.
    ls9_pth = str(Path(ls9_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls9c_ard_3/', '')).parent)

    if ls9_pth in ls9_common_prefix_lookup:
        ls9_common_ids.append(str(ls9_ds.id))
        ls9_paths.append(ls9_pth)

# One row per LS9 dataset on a shared prefix; `prefix` is the join key used later.
ls9_cmn = pd.DataFrame({"ls9_ids": ls9_common_ids, "prefix": ls9_paths})

In [108]:
# Now extract the LS7 ids for the overlapping scenes.
#
# `ls7_common_ls9_cmn` is a list; look prefixes up in a set built once so the
# loop does not rescan the list for every dataset. Output rows and their order
# are the same as with the list lookup.
ls7_common_prefix_lookup = set(ls7_common_ls9_cmn)

ls7_common_ids = []
ls7_paths = []

for ls7_ds in ls7_ard_datasets:
    # Prefix = parent folder of the first URI, relative to the LS7 product root.
    ls7_pth = str(Path(ls7_ds.uris[0].replace('s3://dea-public-data/baseline/ga_ls7e_ard_3/', '')).parent)

    if ls7_pth in ls7_common_prefix_lookup:
        ls7_common_ids.append(str(ls7_ds.id))
        ls7_paths.append(ls7_pth)

# One row per LS7 dataset on a shared prefix; `prefix` is the join key used later.
ls7_cmn = pd.DataFrame({"ls7_ids": ls7_common_ids, "prefix": ls7_paths})

In [109]:
# Keep only the overlapping LS9 ids that also appear in the CSV's `ls9_id` column.
# NOTE(review): the variable name says "more than one", while the comment on the
# CSV load describes those rows as scenes where gqa < 1 - confirm which is meant.
csv_ls9_id_set = set(wof_ls9_overlapping_scene.ls9_id)
ls9_gqa_more_than_one_to_process = list(csv_ls9_id_set & set(ls9_common_ids))

# Flag column used after the left merge to tell matched rows from unmatched ones.
ls9_valid_gqa_df = pd.DataFrame(
    {
        "ls9_ids": ls9_gqa_more_than_one_to_process,
        "gqa_valid": True,
    }
)

In [110]:
# Row count of the overlapping LS9 ids that were found in the CSV.
ls9_valid_gqa_df.shape[0]

1153

In [111]:
# Pair every overlapping LS9 dataset with the LS7 dataset(s) on the same prefix.
df_merged = pd.merge(ls9_cmn, ls7_cmn, how='left', on='prefix')

In [112]:
# Attach the `gqa_valid` flag; LS9 ids absent from the flag table get a missing value.
# NOTE: this rebinds `df_merged`, so re-running the cell merges the flag column a second time.
df_merged = pd.merge(df_merged, ls9_valid_gqa_df, how='left', on='ls9_ids')

In [113]:
# Sanity check: row counts of the merged table and of both inputs.
tuple(map(len, (df_merged, ls9_cmn, ls7_cmn)))

(1221, 1221, 1221)

In [114]:
# Drop rows whose LS9 id was not flagged; only rows with gqa_valid == True remain.
gqa_valid_mask = df_merged['gqa_valid'] == True
df_merged = df_merged[gqa_valid_mask]

In [139]:
# Write the full list of LS9 ARD datasets to process 

In [136]:
# Flagged overlapping LS9 ids first, followed by the ids of LS9 scenes with no LS7 overlap.
ls9_ids_to_process = [*df_merged.ls9_ids, *ls9_non_overlapping_ids]

In [137]:
# Save the ids as a single space-separated line.
ls9_ard_ids_21_22 = ' '.join(ls9_ids_to_process)
with open("ls9_ard_ids_21_22.txt", mode="w") as out_file:
    out_file.write(ls9_ard_ids_21_22)

In [142]:
# Compile the list to archive: one row per dataset in `ls7_fc_scenes`, holding
# its id and its first URI.
#
# NOTE(review): `ls7_fc_scenes` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. The query that produced it must
# be restored above this cell - TODO confirm the product/platform/time filter used.
#
# The former `ids = []` and `path = []` initialisers were dead assignments
# (`ids` was rebound immediately and `path` was never read) and are removed.
ids = [x.id for x in ls7_fc_scenes]
ls7_fc_scene_prefix = [str(x.uris[0]) for x in ls7_fc_scenes]

ls7_21_22_scenes = pd.DataFrame({"ls7_fc_id": ids, "ls_fc_prefix": ls7_fc_scene_prefix})

In [145]:
# For every URI in `ls7_21_22_scenes.ls_fc_prefix`, record each (ls7 id, prefix)
# row of `df_merged` whose prefix occurs as a substring of that URI.
#
# NOTE(review): the ids collected here come from `df_merged.ls7_ids` (built from
# the LS7 ARD query), not from the `ls7_fc_id` column - confirm that this is the
# id set intended for the archive list.
ls7_fc_scenes_to_archive = []
ls7_fc_paths_to_archive = []

# The (id, prefix) pairs do not change between URIs, so materialise them once.
valid_ls7_pairs = list(zip(df_merged.ls7_ids.values, df_merged.prefix.values))

for fc_uri in ls7_21_22_scenes.ls_fc_prefix:
    for ls7_id, scene_prefix in valid_ls7_pairs:
        if scene_prefix not in fc_uri:
            continue
        ls7_fc_scenes_to_archive.append(ls7_id)
        ls7_fc_paths_to_archive.append(scene_prefix)

In [149]:
# Save the collected ids as a single space-separated line.
ls7_fc_scenes_to_archive_21_22 = ' '.join(ls7_fc_scenes_to_archive)
with open("ls7_fc_scenes_to_archive_21_22.txt", mode="w") as out_file:
    out_file.write(ls7_fc_scenes_to_archive_21_22)

In [150]:

# fc_ls_7_paths_to_be_archived = [str(Path(x.replace("s3://dea-public-data/derivative/ga_ls_fc_3/2-5-0/", "")).parent) for x in ls7_fc_paths_to_archive]
# len(set(fc_ls_7_paths_to_be_archived).intersection(set(df_merged.prefix.values)))

In [127]:
# query = {}                  

# query['time'] = ("2023-01-01", "2023-12-31")
# query['dataset_maturity'] = 'final'

# ls9_ard_datasets = dc.find_datasets(product='ga_ls9c_ard_3', **query)
# ls9_ard_datasets_ids = [str(x.id) for x in ls9_ard_datasets]
# # space_separated_ds_ls9_2023 = ' '.join(ls9_ard_datasets_ids)
# with open("ls9_ard_prod_2023.txt", "w") as file:    
#      for ds in ls9_ard_datasets_ids:                
#         file.write(str(ds)+"\n")
        

10088

In [4]:
# query = {}
# query['time'] = ("2023-01-01", "2023-12-31")
# query['dataset_maturity'] = 'final'
# len(dc.find_datasets(product='ga_ls_fc_3', platform='landsat-9', **query))

10088

In [21]:
# query = {}
# query['time'] = ("2023-01-01", "2023-12-31")
# len(dc.find_datasets(product='ga_ls_fc_3', platform='landsat-8', **query))

10142

In [23]:
# query = {}                  
# ds_ids = []

# query['time'] = ("2024-05-01", "2024-05-31")
# query['dataset_maturity'] = 'final'

# ls8_fc_mature_datasets = dc.find_datasets(product='ga_ls_fc_3', platform='landsat-8',  **query)
# ls8_fc_mature_datasets

277

In [63]:
# fc_paths = []

# from pathlib import Path
# for x in ls8_fc_mature_datasets:
#     # print(x.uris)
#     pth = Path(x.uris[0].replace('s3://dea-public-data/derivative/ga_ls_fc_3/2-5-1/', '')).name.replace('ga_ls_fc_3_', '')
#     fc_paths.append(pth[7:17])

In [47]:
# query = {}                  
# ds_ids = []

# dc = datacube.Datacube(app="04_Loading_data")


# query['time'] = ("2024-05-01", "2024-05-31")
# query['dataset_maturity'] = 'final'

# ls8_mature_datasets = dc.find_datasets(product='ga_ls8c_ard_3', **query)


In [61]:
# ard_paths = []
# from pathlib import Path
# for x in ls8_mature_datasets:
#     pth = Path(x.uris[0].replace('s3://dea-public-data/baseline/ga_ls8c_ard_3', '')).name.replace('ga_ls8c_ard_3-2-1_', '')        
#     ard_paths.append(pth[7:17])

In [25]:
# from collections import Counter


# count_per_year = {}
# for dataset in datasets:
#     year = dataset.center_time.year
#     if year in count_per_year:
#         count_per_year[year] += 1
#     else:
#         count_per_year[year] = 1

# # Print the count per year
# print("Count per year:")
# for year, count in sorted(count_per_year.items()):
#     print(f"{year}: {count}")
# # Replace 'your_product_name' with the name of the product for which you want to count the number of datasets.

# # This code properly iterates over the datasets, extracts the acquisition years, and counts the number of datasets per year. Then, it prints the counts per year.

Count per year:
2021: 1194
2022: 9448
2023: 9785
2024: 2273


In [21]:
# Show the acquisition time range of the first dataset.
# The recorded output is a Range(begin=..., end=...), which has no attribute `s`;
# the trailing `.s` was a leftover partial edit and is removed.
# NOTE(review): `datasets` is not assigned in any active cell of this notebook -
# restore the query that defines it, or delete this scratch cell.
datasets[0].time

Range(begin=datetime.datetime(2023, 11, 14, 0, 56, 12, 576902, tzinfo=datetime.timezone.utc), end=datetime.datetime(2023, 11, 14, 0, 56, 42, 375502, tzinfo=datetime.timezone.utc))