In [None]:
# headers
import os
import random
import numpy as np
import pandas as pd
import skimage.io as io
from tqdm.notebook import tqdm, trange
import re

In [None]:
def is_clear(qa_value):
    """Tell whether a QA value has none of its five lowest flag bits set.

    Bits 0-4 are treated as the cirrus, cloud, adjacent, shadow and snow
    flags (bit 0 first). Higher bits are ignored.

    Parameters
    ----------
    qa_value : int or integer numpy array
        QA value(s) to test.

    Returns
    -------
    bool or boolean numpy array
        True where all five flag bits are zero, element-wise for arrays.
    """
    # cirrus | cloud | adjacent | shadow | snow  ->  bits 0..4
    contamination_bits = 0b11111
    return (qa_value & contamination_bits) == 0
def date_to_string(target):
    """Format a day number as a string zero-padded to at least three characters.

    Parameters
    ----------
    target : int
        Day number to format.

    Returns
    -------
    str
        '005' for 5, '042' for 42, '123' for 123. Values of 100 or more are
        returned unpadded. Zero or negative input prints a warning and
        yields '000'.
    """
    # Non-positive days are reported and mapped to a fixed placeholder.
    if target <= 0:
        print('Receiving date 0')
        return '000'
    if target >= 100:
        return str(target)
    padding = '00' if target < 10 else '0'
    return f'{padding}{target}'
def search_pair(target, path_L, date_L, path_S, date_S, radius=1):
    """Collect the available L and S dates within `radius` days of `target`.

    Parameters
    ----------
    target : int
        Centre of the search window.
    path_L, path_S :
        Accepted for interface compatibility; not used by the search.
    date_L, date_S : container of int
        Days for which an L / S acquisition exists.
    radius : int, optional
        Half-width of the window; days target-radius .. target+radius
        (inclusive) are examined.

    Returns
    -------
    (list of int, list of int)
        Matching L days and matching S days, each in ascending order.
    """
    window = range(target - radius, target + radius + 1)
    L_dates = [day for day in window if day in date_L]
    S_dates = [day for day in window if day in date_S]
    return L_dates, S_dates

In [None]:
def _find_granule(folder_names, name_head, day):
    """Return the folder whose name contains `name_head` + 3-digit `day` + one more character."""
    prefix = f'{name_head}{date_to_string(day)}'
    # The prefix comes from a file name and contains dots; escape it so they
    # are matched literally instead of acting as regex wildcards.
    pattern = re.compile(re.escape(prefix) + '.')
    matches = [name for name in folder_names if pattern.search(name)]
    if not matches:
        # Fail loudly instead of silently reusing a folder from an earlier date.
        raise FileNotFoundError(f'no granule folder matching {prefix!r}')
    # Several folders may share a day; keep the last one in sorted order.
    return matches[-1]


def _read_granule(sensor_dir, folder_name, bands, start, scale, tile_size, out):
    """Read every band of one granule, subsample it and store it in out[:, :, band_index]."""
    for b_i, band in enumerate(bands):
        image = io.imread(f'{sensor_dir}/{folder_name}/{folder_name}.{band}.tif')
        # Keep every `scale`-th pixel along both axes, starting at `start`.
        out[:, :, b_i] = image[start:tile_size:scale, start:tile_size:scale]


def _fuse_observations(base_img):
    """Merge a (n_obs, H, W, 9) stack into one (H, W, 9) image.

    Channel 8 of `base_img` is the clear flag (1 clear, 0 otherwise). Every
    output pixel takes its values from a clear observation. When several
    observations are clear, the one with the most clear pixels overall wins.
    Pixels with no clear observation stay at -9999 with a clear flag of 0.
    """
    fuse_img = np.full(base_img.shape[1:], -9999, dtype=np.int32)
    fuse_img[:, :, 8] = 0
    clear = base_img[:, :, :, 8] == 1
    pix_bin = clear.reshape(clear.shape[0], -1).sum(axis=1)
    # Write observations from fewest to most clear pixels so that the clearest
    # one is written last. A pixel clear in only one observation is written
    # exactly once, so no separate single-observation pass is needed.
    for t in np.argsort(pix_bin):
        fuse_img[clear[t]] = base_img[t, clear[t]]
    return fuse_img


def fuse_HLS(dates, year, scale, target_dir, start, data_root='../HLS/dataset'):
    """Build a per-date composite cube from the L30 and S30 granules of one tile.

    For every target date, all L30 and S30 granules found by `search_pair`
    around that date are read, subsampled, masked with `is_clear` and fused
    into a single image.

    Parameters
    ----------
    dates : numpy array of int
        Target days; one composite is produced per entry.
    year : int
        Year sub-directory to read from.
    scale : int
        Subsampling step; every `scale`-th pixel is kept along each axis.
    target_dir : str
        Tile sub-directory below `<data_root>/<sensor>/<year>/`.
    start : int
        Index of the first sampled pixel along each axis. It must give
        exactly 3660 // scale samples per axis (e.g. 0 <= start < scale when
        `scale` divides 3660).
    data_root : str, optional
        Directory containing the `L30` and `S30` trees.

    Returns
    -------
    cube : int32 array of shape (len(dates), 3660 // scale, 3660 // scale, 8)
        Channels 0-6 hold the fused band values (-9999 where no clear
        observation exists). Channel 7 is the clear flag (1 clear, 0 not
        clear or missing).
    total_n : 1-D integer array of length (3660 // scale) ** 2
        Number of dates with a clear observation for each pixel, in row-major
        pixel order.

    Raises
    ------
    FileNotFoundError
        If a sensor directory is missing, or no folder matches a day that
        `search_pair` reported as available.
    """
    tile_size = 3660
    grid = tile_size // scale
    L_dir = f'{data_root}/L30/{year}/{target_dir}'
    S_dir = f'{data_root}/S30/{year}/{target_dir}'
    path_L = sorted(os.listdir(L_dir))
    path_S = sorted(os.listdir(S_dir))
    # Characters 19-21 of a granule name are read as the 3-digit day.
    date_L = [int(name[19:22]) for name in path_L]
    date_S = [int(name[19:22]) for name in path_S]
    # The first 19 characters are taken as the prefix shared by a sensor's
    # granules. An empty directory simply contributes no observations.
    L_head = path_L[0][:19] if path_L else ''
    S_head = path_S[0][:19] if path_S else ''
    L_bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'Fmask']
    S_bands = ['B01', 'B02', 'B03', 'B04', 'B8A', 'B11', 'B12', 'Fmask']

    n_dates = dates.shape[0]
    fuse_cube = np.full((n_dates, grid, grid, 9), -9999, dtype=np.int32)
    fuse_cube[:, :, :, 8] = 0
    for d_i, target_date in enumerate(tqdm(dates)):
        L_dates, S_dates = search_pair(target_date, path_L, date_L, path_S, date_S)
        # L30 observations first, then S30, as (directory, folder, bands).
        granules = [(L_dir, _find_granule(path_L, L_head, d), L_bands) for d in L_dates]
        granules += [(S_dir, _find_granule(path_S, S_head, d), S_bands) for d in S_dates]
        if not granules:
            continue
        base_img = np.empty((len(granules), grid, grid, 9), dtype=np.int32)
        for o_i, (sensor_dir, folder_name, bands) in enumerate(granules):
            _read_granule(sensor_dir, folder_name, bands, start, scale, tile_size, base_img[o_i])
        # Channel 8: 1 for clear, 0 for NOT clear or missing.
        clear = is_clear(base_img[:, :, :, 7].astype(int))
        base_img[:, :, :, 8] = clear
        base_img[~clear, :7] = -9999
        fuse_cube[d_i] = _fuse_observations(base_img)

    # Done once after the loop: running this per date left dates skipped at
    # the end with a QA of -9999, and left total_n unbound when every date
    # was skipped.
    fuse_cube[:, :, :, 7] = fuse_cube[:, :, :, 8]
    total_n = fuse_cube[:, :, :, 8].sum(axis=0).reshape(-1)
    return fuse_cube[:, :, :, :8], total_n

In [None]:
# Target days: one every (2 * date_radius + 1) days from start_date to
# end_date, so consecutive +/- date_radius search windows tile the range
# without overlapping.
start_date = 1
end_date = 365
date_radius = 1
dates = np.arange(start_date, end_date + 1, 2 * date_radius + 1)

# Tile directories, five per group; entries at the same position are
# processed together.
train_tiles = ['11/S/P/R', '12/T/V/K', '14/T/N/P', '17/R/N/J', '18/T/W/Q']
valid_tiles = ['11/S/P/S', '12/T/V/L', '14/T/P/P', '17/R/N/K', '18/T/W/P']
test_tiles = ['11/S/Q/T', '12/T/V/M', '14/T/Q/P', '17/R/N/L', '18/T/W/N']

In [None]:
# Band column names, '<date index>.<band>', grouped by date index.
# 122 is the number of entries in `dates`; eight columns per date.
lst = []
for i in range(122):
    lst.extend(
        f'{i:03d}.{band}'
        for band in ('coastal_blue', 'blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'qa')
    )

In [None]:
# Sampling parameters.
year = 2021
scale = 10   # keep every `scale`-th pixel along each image axis
divide = 5   # pixel offset used for the training sampling grid

# Numeric columns of the output table: pixel coordinates, year, one column per
# (date, band) pair, and the per-pixel clear-observation count.
col_val = ['col', 'row', 'image_year', *lst, 'total_n']
width = len(col_val)

# 30 row blocks of (3660*3660 // scale // scale) pixels each: the fill loop
# runs 5 iterations and writes 6 blocks per iteration.
height = 30 * (3660 * 3660 // scale // scale)
total_matrix = np.zeros(shape=(height, width), dtype=np.int32)

In [None]:
def _sample_coords(start):
    """Return the (n_pixels, 2) index pairs of the grid sampled from `start` with step `scale`.

    Pairs are in row-major pixel order, matching the flattened cubes below.
    The first value is the index along image axis 0, the second along axis 1.
    """
    idx = np.arange(start, 3660, scale)
    return np.stack(np.meshgrid(idx, idx, indexing='ij'), axis=-1).reshape(-1, 2)


n_pix = 3660 * 3660 // scale // scale   # pixels per sampled tile
n_dates = len(dates)

# Coordinates depend only on the sampling offset, so compute them once.
# Training rows are sampled with offset `divide`, validation/test rows with
# offset 0; their coordinates must use the same offsets.
train_cube_cr = _sample_coords(divide)
vt_cube_cr = _sample_coords(0)
# NOTE(review): the first coordinate (axis-0 index) is stored in the column
# named 'col' and the second in 'row' -- confirm this naming is intended.

for j in trange(5):
    # The six row blocks written per iteration: (tile, sampling offset, coordinates).
    block_layout = [
        (train_tiles[j], divide, train_cube_cr),   # train
        (valid_tiles[j], divide, train_cube_cr),   # train
        (test_tiles[j], divide, train_cube_cr),    # train
        (train_tiles[j], 0, vt_cube_cr),           # valid
        (valid_tiles[j], 0, vt_cube_cr),           # valid
        (test_tiles[j], 0, vt_cube_cr),            # test
    ]
    for k, (tile, start, coords) in enumerate(block_layout):
        rows = slice((j * 6 + k) * n_pix, (j * 6 + k + 1) * n_pix)
        cube, total_n = fuse_HLS(dates, year, scale, tile, start)
        # col & row
        total_matrix[rows, 0:2] = coords
        # img_yr
        total_matrix[rows, 2] = year
        # band: (date, y, x, band) -> one row per pixel, columns grouped by date
        total_matrix[rows, 3:-1] = cube.transpose(1, 2, 0, 3).reshape(-1, 8 * n_dates)
        # total_n
        total_matrix[rows, -1] = total_n

In [None]:
# Wrap the filled matrix in a DataFrame, one named column per matrix column.
df = pd.DataFrame(data=total_matrix, columns=col_val)

In [None]:
# Label columns. 'lat' and 'lon' are created empty and are not filled here.
df['tile_id'] = pd.Series(dtype='string')
df['TVT'] = pd.Series(dtype='string')
df['lat'] = pd.Series(dtype='float32')
df['lon'] = pd.Series(dtype='float32')

n_pix = 3660 * 3660 // scale // scale   # rows per block
tile_id_col = df.columns.get_loc('tile_id')
tvt_col = df.columns.get_loc('TVT')

for i in trange(5):
    # Same six-block layout the matrix was filled with: (tile, split).
    block_layout = [
        (train_tiles[i], 'train'),
        (valid_tiles[i], 'train'),
        (test_tiles[i], 'train'),
        (train_tiles[i], 'valid'),
        (valid_tiles[i], 'valid'),
        (test_tiles[i], 'test'),
    ]
    for k, (tile, split) in enumerate(block_layout):
        # Positional, half-open slice. A label slice with .loc includes its
        # end row and would spill one row into the following block.
        rows = slice((i * 6 + k) * n_pix, (i * 6 + k + 1) * n_pix)
        # tile_id: the tile path with its '/' separators dropped, e.g. '11/S/P/R' -> '11SPR'
        df.iloc[rows, tile_id_col] = tile[:2] + tile[3] + tile[5] + tile[7]
        # TVT
        df.iloc[rows, tvt_col] = split

In [None]:
# Write the table without the index column.
# NOTE(review): the content is gzip-compressed although the file name ends in
# '.csv'; readers must pass compression='gzip' explicitly.
df.to_csv('F:/dataset.csv', compression='gzip', index=False)