# Notebook for changing filenames of duplicate HIRES tiles

A handful of tiles in the HIRES mosaic have duplicate PDS3 product IDs.
This notebook goes through the data set and re-converts those tiles,
changing their filenames and PDS4 LIDs to make them unique.

In [None]:
import fs

from clem_bulk import crude_time_log
from clem_conversion import *

In [None]:
# Load the index of standard mosaic products, then keep every row whose
# pds3_product_id occurs more than once. keep=False marks all members of a
# duplicate group, not only the second and later occurrences.
standard_mosaic_products = pd.read_csv(
    './directories/clementine/standard_mosaic_products.csv'
)
has_duplicate_id = standard_mosaic_products.duplicated(
    subset=['pds3_product_id'],
    keep=False,
)
duped_tiles = standard_mosaic_products.loc[has_duplicate_id]

In [None]:
# Root directories of the input and output data sets; edit these to match
# your own layout.
input_root = '/home/ubuntu/buckets/clem_input/'
output_root = '/home/ubuntu/buckets/clem_reprocess/'
# Products are written here first and moved to output_root afterwards. The
# detour exists only because gdal breaks when it tries to write directly to
# a s3fs-fuse 'filesystem'; other configurations would not need it.
temp_output_directory = '/home/ubuntu/data_temp/'

# The hires source index, grouped by tile name. It is used to associate each
# hires mosaic tile with an index of EDR products that includes time, in
# order to determine the tile's start and stop times.
hires_source_index = pd.read_csv(
    './directories/clementine/hires_source_index.csv',
)
hires_source_groupby = hires_source_index.groupby('tilename')

In [None]:
# Re-convert every tile whose PDS3 product ID is duplicated. Each product's
# PDS4 root name gets a suffix taken from its source path so that the output
# filenames no longer collide.
for ix, tile in enumerate(duped_tiles.itertuples()):
    print("Converting " + fs.path.split(tile.file)[1])

    tile_start_time = dt.datetime.now()  # just for logging

    source_path = tile.file
    source_groups = hires_source_groupby
    # characters 3-6 of the source path; presumably the volume identifier
    # -- TODO confirm against the paths in standard_mosaic_products.csv
    volume = source_path[3:7]

    destination_path = tile.newpath
    # just a double-check to avoid any possible extra '/' in S3 object names --
    # in some cases it can cause really irritating problems.
    if output_root.endswith('/') and destination_path.startswith('/'):
        destination_path = destination_path[1:]
    destination_directory = output_root + destination_path
    sh.mkdir("-p", destination_directory)

    # initialize writer & convert product
    writer = ClemMosaicConverter(
        input_root + source_path,
        source_groups=source_groups
    )

    # In case unsuffixed versions were written out before: delete them from
    # the destination directory. The path is built from the same directory
    # used for mkdir/mv; the previous output_root[:-1] + destination_path
    # form dropped the separator between the two, so nothing was ever found.
    for extension in ['.xml', '.tif']:
        stale_product = os.path.join(
            destination_directory, writer.pds4_root + extension
        )
        try:
            os.remove(stale_product)
        except FileNotFoundError:
            print('no existing file')

    # rename only after the cleanup above, which needs the unsuffixed name
    writer.pds4_root = writer.pds4_root + '_' + volume
    writer.write_pds4(temp_output_directory)

    # NOTE(review): _bg=True launches each mv without waiting for it, so a
    # failed move would not surface in this loop -- confirm that is intended.
    for extension in ['.xml', '.tif']:
        sh.mv(
            temp_output_directory + writer.pds4_root + extension,
            destination_directory,
            _bg=True
        )

    # very simple logger
    elapsed = str((dt.datetime.now() - tile_start_time).total_seconds())
    crude_time_log(
        'mosaic_conversion_log_hires_dupes.csv',
        writer,
        elapsed
    )
    print("total seconds: " + elapsed)