# Import library

In [None]:
import numpy as np
import pandas as pd

from  datetime import timedelta
import warnings
warnings.filterwarnings("ignore")

# Define function

In [None]:
def preprocess_cloud_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a table exported from MATLAB and stamp each row with its local time.

    The capture time is read from fixed character positions at the end of the
    image file name: twelve characters ``YYYYMMDDhhmm`` followed by a
    five-character tail. Only rows captured exactly 10 minutes after the row
    directly above them are kept; the timestamp is then moved forward by
    7 hours (UTC+0 to UTC+7, as noted by the original author).

    Args :

        df : data from MATLAB; must contain a 'filename' column, every other
             column is carried through with NaN replaced by 0

    Returns :

        new DataFrame (the input is left untouched) without 'filename' and
        with a 'Datetime' column; the surviving rows keep their original
        index labels
    """

    # Step 0 fill NaN with 0 (fillna returns a new frame, so `df` is never mutated)
    _df = df.fillna(0)

    # Step 1 parse the capture time straight from the file name; an explicit
    # format avoids per-value format guessing and five scratch columns
    stamp = _df['filename'].str[-17:-5]
    captured_at = pd.to_datetime(stamp, format='%Y%m%d%H%M')

    # Step 2 keep a row only when it is exactly 10 minutes after the previous
    # row (the first row has no predecessor, so it never qualifies)
    keep = (captured_at - captured_at.shift(1)) == timedelta(minutes=10)

    # Step 3 + 4 drop the file name and attach the +7 h timestamp; using
    # assign() on the filtered frame avoids writing into a slice
    return (
        _df.loc[keep]
        .drop(columns=['filename'])
        .assign(Datetime=captured_at.loc[keep] + timedelta(hours=7))
    )
def map_all_ci(df1, df2, df3, df4, df5, df6, df7, df8, *, center=None, sites=None):
    """
    Join, per site, the centre CI with its 1- to 8-step estimates and with the
    future ("lead") centre values, and stack all sites into one long table.

    Args :

        df1 ... df8 : estimate tables for step 1 ... 8; each needs a
            'Datetime' column and one 'siteNNN' column per requested site
        center : reference table with the same layout; when omitted the
            module-level ``center_df`` is used
        sites : iterable of site numbers; when omitted sites 1 ... 56 are used

    Returns :

        DataFrame with columns 'Datetime', 'ci_center', 'ci_est(t+1)' ...
        'ci_est(t+8)', 'ci_center_lead1step' ... 'ci_center_lead8step' and
        'site'. Each site is re-indexed onto a regular 30-minute grid, so
        missing slots appear as NaN rows. Empty when `sites` is empty.
    """
    if center is None:
        # Backward-compatible default: the frame defined at notebook level.
        center = center_df
    if sites is None:
        sites = range(1, 57)

    # Tables read from CSV carry 'Datetime' as text while preprocessed tables
    # carry datetime64; merging the two raises in pandas. Coerce every key
    # once, up front (assign() leaves the caller's frames untouched).
    center = center.assign(Datetime=pd.to_datetime(center['Datetime']))
    step_frames = [
        frame.assign(Datetime=pd.to_datetime(frame['Datetime']))
        for frame in (df1, df2, df3, df4, df5, df6, df7, df8)
    ]

    per_site = []
    for site_no in sites:
        site_col = f'site{site_no:03d}'

        site_df = center[['Datetime', site_col]].rename(columns={site_col: 'ci_center'})
        for step, frame in enumerate(step_frames, start=1):
            estimate = frame[['Datetime', site_col]].rename(
                columns={site_col: f'ci_est(t+{step})'}
            )
            # Inner join: keep only timestamps present in every table.
            site_df = site_df.merge(estimate, on='Datetime', how='inner')

        site_df = site_df.set_index('Datetime')
        for step in range(1, len(step_frames) + 1):
            # Moving the index back by 30*step minutes places the value
            # observed at t + 30*step min on the row of time t.
            site_df[f'ci_center_lead{step}step'] = site_df['ci_center'].shift(
                -30 * step, freq='min'
            )

        site_df = site_df.resample('30min').asfreq().reset_index()
        site_df['site'] = site_no
        per_site.append(site_df)

    if not per_site:
        return pd.DataFrame()
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(per_site, ignore_index=True)

# Process data

In [None]:
# Reference table; map_all_ci takes its 'ci_center' values from this
# module-level frame.
center_df = pd.read_csv('ci_center_all_sites_processed.csv')

# 'BlockMatching_Interpolate_W11' series: read the workbook of each step
# (1-8) and pass it through preprocess_cloud_df.
(
    df1step11, df2step11, df3step11, df4step11,
    df5step11, df6step11, df7step11, df8step11,
) = (
    preprocess_cloud_df(
        pd.read_excel(f'BlockMatching_Interpolate_W11_CI_{step}_step.xlsx')
    )
    for step in range(1, 9)
)

In [None]:
# 'BlockMatching_Interpolate_W33' series: read the workbook of each step
# (1-8) and pass it through preprocess_cloud_df.
(
    df1step33, df2step33, df3step33, df4step33,
    df5step33, df6step33, df7step33, df8step33,
) = (
    preprocess_cloud_df(
        pd.read_excel(f'BlockMatching_Interpolate_W33_CI_{step}_step.xlsx')
    )
    for step in range(1, 9)
)

In [None]:
# 'BlockMatching_Interpolate_W55' series: read the workbook of each step
# (1-8) and pass it through preprocess_cloud_df.
(
    df1step55, df2step55, df3step55, df4step55,
    df5step55, df6step55, df7step55, df8step55,
) = (
    preprocess_cloud_df(
        pd.read_excel(f'BlockMatching_Interpolate_W55_CI_{step}_step.xlsx')
    )
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix '1E-4': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step1e4, df2step1e4, df3step1e4, df4step1e4,
    df5step1e4, df6step1e4, df7step1e4, df8step1e4,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_1E-4.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix '1E-3': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step1e3, df2step1e3, df3step1e3, df4step1e3,
    df5step1e3, df6step1e3, df7step1e3, df8step1e3,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_1E-3.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix '002': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step002, df2step002, df3step002, df4step002,
    df5step002, df6step002, df7step002, df8step002,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_002.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix '02': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step02, df2step02, df3step02, df4step02,
    df5step02, df6step02, df7step02, df8step02,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_02.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix 'L1': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step1, df2step1, df3step1, df4step1,
    df5step1, df6step1, df7step1, df8step1,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_L1.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'HS' series with suffix '10': read the workbook of each step (1-8) and
# pass it through preprocess_cloud_df.
(
    df1step10, df2step10, df3step10, df4step10,
    df5step10, df6step10, df7step10, df8step10,
) = (
    preprocess_cloud_df(pd.read_excel(f'HS_CI_{step}_step_10.xlsx'))
    for step in range(1, 9)
)

In [None]:
# 'FRB' series: one CSV per step (1-8). These frames are NOT passed through
# preprocess_cloud_df, so they are used exactly as stored on disk
# (presumably they already contain a 'Datetime' column - verify).
(
    df1stepFRB, df2stepFRB, df3stepFRB, df4stepFRB,
    df5stepFRB, df6stepFRB, df7stepFRB, df8stepFRB,
) = (
    pd.read_csv(f'FRB_CI_{step}_step.csv')
    for step in range(1, 9)
)


In [None]:
# Reload the reference table and discard every row whose value columns
# (everything except 'Datetime') are all exactly 0.
center_df = pd.read_csv('ci_center_all_sites_processed.csv')
mask = center_df.drop(columns=['Datetime']).eq(0).all(axis=1)
center_df = center_df.loc[~mask]
# Bare expression: shows the filtered table as the cell output.
center_df

In [None]:
# Build one combined table per series by handing its eight per-step
# frames (step 1 ... step 8, in order) to map_all_ci.

# BlockMatching_Interpolate series
all_ci_df11 = map_all_ci(
    df1step11, df2step11, df3step11, df4step11,
    df5step11, df6step11, df7step11, df8step11,
)
all_ci_df33 = map_all_ci(
    df1step33, df2step33, df3step33, df4step33,
    df5step33, df6step33, df7step33, df8step33,
)
all_ci_df55 = map_all_ci(
    df1step55, df2step55, df3step55, df4step55,
    df5step55, df6step55, df7step55, df8step55,
)

# HS series
all_ci_df1e4 = map_all_ci(
    df1step1e4, df2step1e4, df3step1e4, df4step1e4,
    df5step1e4, df6step1e4, df7step1e4, df8step1e4,
)
all_ci_df1e3 = map_all_ci(
    df1step1e3, df2step1e3, df3step1e3, df4step1e3,
    df5step1e3, df6step1e3, df7step1e3, df8step1e3,
)
all_ci_df002 = map_all_ci(
    df1step002, df2step002, df3step002, df4step002,
    df5step002, df6step002, df7step002, df8step002,
)
all_ci_df02 = map_all_ci(
    df1step02, df2step02, df3step02, df4step02,
    df5step02, df6step02, df7step02, df8step02,
)
all_ci_df1 = map_all_ci(
    df1step1, df2step1, df3step1, df4step1,
    df5step1, df6step1, df7step1, df8step1,
)
all_ci_df10 = map_all_ci(
    df1step10, df2step10, df3step10, df4step10,
    df5step10, df6step10, df7step10, df8step10,
)

# FRB series
all_ci_dfFRB = map_all_ci(
    df1stepFRB, df2stepFRB, df3stepFRB, df4stepFRB,
    df5stepFRB, df6stepFRB, df7stepFRB, df8stepFRB,
)