In [None]:

def create_mask(ds, mask_type, granule_id=None, title=None):
    """
    Build a boolean mask of the pixels whose Fmask value belongs to one class.

    Refer: HLS User Guide v2 Table 9 and Appendix.

    Parameters:
        ds: Object supporting ``ds[['Fmask']].squeeze().to_array()``
            (presumably an xarray Dataset with an 'Fmask' variable --
            confirm against the caller).
        mask_type: Value searched with ``in`` for the keywords 'cloud',
            'water', 'snow' and 'aerosol', in that order. The first keyword
            found decides which codes are used.
        granule_id: Unused.
        title: Unused.

    Returns:
        Boolean array (from ``np.isin``) that is True where the Fmask value
        is one of the codes selected for ``mask_type``.

    Raises:
        ValueError: If none of the keywords is found in ``mask_type``.
    """
    fmask_values = ds[['Fmask']].squeeze().to_array()

    # Ordered (keyword, Fmask byte values) pairs; order matters because the
    # first keyword contained in ``mask_type`` wins.
    codes_by_keyword = (
        ('cloud', [66, 74, 130, 136, 194, 200, 202]),
        # 0b01100000, 0b10100000, 0b11100000
        ('water', [96, 160, 224]),
        # 0b01010000, 0b10010000, 0b11010000
        ('snow', [80, 144, 208]),
        # 0b11000000 -- only the high aerosol component is considered
        ('aerosol', [192]),
    )

    for keyword, codes in codes_by_keyword:
        if keyword in mask_type:
            return np.isin(fmask_values, codes)

    raise ValueError("Choose valid mask option")

def convert_fmask_unique(raster):
    """
    Collapse HLS Fmask byte values into unique class codes.

    Refer: HLS User Guide v2 Table 9 and Appendix.
    For convenient comparison, the HLS Fmask values are converted to the
    class codes 1 (water), 2 (cloud shadow), 3 (snow) and 4 (cloud) to match
    the original Fmask outputs. Every value that is not listed for one of
    these classes becomes NaN.

    Parameters:
        raster: NumPy array of HLS Fmask values. A floating-point array is
            modified in place. An integer array cannot hold NaN, so it is
            left untouched and a new float32 array is returned instead.

    Returns:
        The converted array: ``raster`` itself for floating-point input,
        otherwise a new float32 array.
    """
    # (class code, HLS Fmask values) in order of precedence. 200 is listed
    # for both cloud and cloud shadow, and 144 for both cloud shadow and
    # snow; the class listed first wins.
    class_codes = (
        (4, (66, 74, 130, 136, 194, 200, 202)),  # cloud
        (2, (68, 72, 132, 144, 196, 200)),       # cloud shadow
        (1, (96, 160, 224)),                     # water [01100000, 10100000, 11100000]
        (3, (80, 144, 208)),                     # snow
    )

    if not np.issubdtype(raster.dtype, np.floating):
        raster = raster.astype(np.float32)

    # All conditions are evaluated on the untouched input values before
    # anything is written back. Otherwise raw values that happen to equal a
    # class code (1-4) would be mistaken for already-converted pixels.
    conditions = [np.isin(raster, codes) for _, codes in class_codes]
    choices = [code for code, _ in class_codes]

    # np.select picks the first true condition per pixel; unmatched -> NaN.
    raster[...] = np.select(conditions, choices, default=np.nan)
    return raster


        
def apply_mask(band_data, mask):
    """
    Apply a mask to the band data.

    Parameters:
        band_data: Object exposing a ``where(condition)`` method
            (presumably an xarray or pandas object -- confirm against the
            caller).
        mask: Mask whose True entries mark the values to be masked out.

    Returns:
        The result of ``band_data.where(...)`` called with the condition
        "mask is not True", i.e. values are kept wherever ``mask`` is not
        True.
    """
    # Compared with ``!=`` rather than negated so that the comparison is
    # exactly the one applied to whatever mask type the caller passes in.
    keep = mask != True  # noqa: E712
    return band_data.where(keep)

def get_fmask_info(i):
    """
    Return fmask version info and title based on index.

    Parameters:
        i (int): Index. 0, 1 and 3 have dedicated entries; every other
            value falls back to the Fmask 5 entry.

    Returns:
        dict: {
            'fmask_version': str,
            'title': str
        }
    """
    # (index, fmask_version, title) for the indices with a dedicated entry.
    known_entries = (
        (0, '', 'Reference'),
        (1, 'fmask4.7', 'Fmask 4.7'),
        (3, 'cirrus', 'Cirrus band TOA'),
    )

    for index, version, label in known_entries:
        if i == index:
            return {'fmask_version': version, 'title': label}

    # Any other index is treated as Fmask 5.
    return {'fmask_version': 'fmask5_May2025', 'title': 'Fmask 5'}


In [None]:
def _class_percentage_table(raster, date, mrgs, fmask_version):
    """Return one row per class code present in ``raster`` with its share of all pixels."""
    class_names = ['Land/NaN', 'Water', 'Cloud Shadow', 'Snow/Ice', 'Cloud']

    # NaN pixels are excluded from the classes but still count in the
    # denominator, so percentages are relative to the full raster.
    codes, counts = np.unique(raster[~np.isnan(raster)], return_counts=True)
    n_rows = len(codes)

    return pd.DataFrame({
        'Date': [date] * n_rows,
        'mrgs_id': [mrgs] * n_rows,
        'fmask_version': [fmask_version] * n_rows,
        'Features': [class_names[int(code)] for code in codes],
        'Percentage': (counts / raster.size * 100).tolist(),
    })


def get_fmask_percentage_compare(files_s3_fmask5, files_s3_fmask4, days):
    """
    Tabulate per-class pixel percentages of Fmask 4 and Fmask 5 rasters.

    For every entry of ``days`` the matching Fmask 4 and Fmask 5 files are
    read from S3, reduced to the class codes 1-4 (0 and 255 are set to NaN),
    and the percentage of pixels in each class is recorded.

    Relies on names defined elsewhere in the file: ``to_julian_date``,
    ``bucket_name``, ``rasterio`` and ``convert_fmask_unique``.

    Parameters:
        files_s3_fmask5: S3 object keys of the Fmask 5 files. The seventh
            ``_``-separated field of a key is used as the granule (mrgs) id.
        files_s3_fmask4: S3 object keys of the Fmask 4 files. The third
            ``.``-separated field of a key is used as the granule (mrgs) id.
        days: Strings identifying the acquisitions. The part before the
            first ``_`` is parsed with ``'%Y%m%dT%H%M%S'``, and the part
            before the first ``T`` is passed to ``to_julian_date``.

    Returns:
        pandas.DataFrame with the columns 'Date', 'mrgs_id',
        'fmask_version', 'Features' and 'Percentage'. It is empty when no
        day produced a match.
    """
    columns = ['Date', 'mrgs_id', 'fmask_version', 'Features', 'Percentage']
    tables = []

    for day in days:
        # convert date to julian date
        day_julian = to_julian_date(day.split('T')[0])

        # granule ids available in Fmask 5 for this day
        mrgs_ids = np.unique(
            [name.split('_')[6] for name in files_s3_fmask5 if day in name]
        )

        # Fmask 4 .tif files of the same day and one of those granules
        matches = [
            name for name in files_s3_fmask4
            if day_julian in name
            and any(mrgs in name for mrgs in mrgs_ids)
            and name.endswith('.tif')
        ]

        # NOTE(review): exactly five matching Fmask 4 files are required per
        # day; where this number comes from is not visible here -- confirm.
        if len(matches) != 5:
            print('No matching for this granules between Fmask 4 and 5')
            continue

        # restrict the granule ids to those that actually matched
        mrgs_ids = np.unique([name.split('.')[2] for name in matches])
        date = datetime.strptime(day.split('_')[0], '%Y%m%dT%H%M%S')

        for mrgs in mrgs_ids:
            granule_files = [name for name in matches if mrgs in name]
            file_fmask4 = [name for name in granule_files if 'Fmask.tif' in name][0]
            file_fmask5 = [
                name for name in files_s3_fmask5 if day in name and mrgs in name
            ][0]

            for file in (file_fmask4, file_fmask5):
                fmask_version = file.split('_')[0]

                # The context manager closes the dataset once it is read.
                with rasterio.open('s3://' + bucket_name + '/' + file) as src:
                    raster = src.read(1).astype(np.float16)

                # simplify Fmask 4 to the same unique classes as Fmask 5
                if 'Fmask4' in fmask_version:
                    raster = convert_fmask_unique(raster)

                raster[raster == 255] = np.nan
                raster[raster == 0] = np.nan  # ignore the land classification

                tables.append(
                    _class_percentage_table(raster, date, mrgs, fmask_version)
                )

    # pd.concat raises on an empty list; return an empty table instead.
    if not tables:
        return pd.DataFrame(columns=columns)

    return pd.concat(tables)