In [1]:
import os
import pandas as pd
import sys
sys.path.append('../')
import utils.csv as csv

In [8]:
# Root data folder (absolute, machine-specific Windows path).
PATH='D:\\Deutschland\\FUB\\master_thesis\\data\\gee'
# Folder whose CSV files are read by the functions in this notebook.
INPUT_DIR = os.path.join(PATH,'validation_grid_20m_cloud10')
# Folder for the summary CSV files written by this notebook.
OUTPUT_DIR = os.path.join(PATH, 'output')
# Folder that receives one pre-processed CSV per input file (see merge_data_frame).
DATA_DIR = os.path.join(OUTPUT_DIR, 'validation_grid_daily_padding')

# File names of the date-occurrence table and of the id list.
DATE_CSV = 'occurrence_val_grid.csv'
MERGE_CSV = 'merged_val_grid.csv'

# Full paths of the two summary files inside OUTPUT_DIR.
date_path = os.path.join(OUTPUT_DIR, DATE_CSV)
merge_path = os.path.join(OUTPUT_DIR, MERGE_CSV)

# The directory listing is taken once, here; re-run this cell if INPUT_DIR changes.
files = os.listdir(INPUT_DIR)

1 Functions for Pandas DataFrame

In [3]:
def shuffle(df:pd.DataFrame, ref:pd.DataFrame) -> pd.DataFrame:
    """Align one input time series with the reference dates.

    Rows that share an index label are collapsed to the last one, the result
    is right-joined onto ``ref`` via ``date`` (so every entry of ``ref`` is
    kept), and the ``count``, ``spacecraft_id`` and ``id`` columns are removed.
    """
    is_repeat = df.index.duplicated(keep='last')
    aligned = pd.merge(df[~is_repeat], ref, how='right', on='date')
    return aligned.drop(columns=['count','spacecraft_id', 'id'])

In [4]:
def zero_padding(df:pd.DataFrame) -> pd.DataFrame:
    """Replace every missing value with 0.

    The frame is modified in place and the very same object is handed back.
    """
    df.fillna(value=0, inplace=True)
    return df

In [5]:
def monthly_mean_interpolate(df:pd.DataFrame, start_month:int=5, end_month:int=9) -> pd.DataFrame:
    """Monthly mean of the time-interpolated series, limited to a month window.

    Gaps are filled by time-weighted linear interpolation, the result is
    averaged per calendar month, and only months within
    ``start_month..end_month`` (inclusive) are kept. The defaults, May to
    September, are the vegetation months.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime index. It is left untouched; interpolation
        works on a copy.
    start_month, end_month : int
        First and last calendar month (1-12) to keep.

    Returns
    -------
    pd.DataFrame
        One row per month end inside the window. Months without any
        observation appear as NaN rows.
    """
    # Interpolate into a new frame so the caller's data is not silently changed.
    filled = df.interpolate(method='time')
    # An explicit month-end offset avoids the string alias 'M', which newer
    # pandas releases deprecate in favour of 'ME'.
    monthly = filled.resample(pd.offsets.MonthEnd()).mean()
    months = monthly.index.month
    return monthly[(months >= start_month) & (months <= end_month)]

In [6]:
def reshape(df:pd.DataFrame) -> pd.DataFrame:
    """Flatten a date-indexed frame into a single-row frame.

    The last column supplies the ``id`` (taken from the first row). Every
    other column contributes one output column per row of ``df``, named
    ``'<YYYYMMDD> <column>'``.
    """
    value_columns = list(df.columns)[:-1]
    flattened = {'id':int(df.iat[0, -1])}
    for timestamp, values in df.iterrows():
        stamp = timestamp.strftime('%Y%m%d')
        flattened.update({f'{stamp} {name}': [values[name]] for name in value_columns})
    return pd.DataFrame(flattened)

2 Count all available dates among all polygons

In [7]:
def rename_file() -> None:
    """Re-export every readable, non-empty input CSV as ``plot_<id>.csv``.

    Each ``.csv`` entry of ``files`` is loaded from ``INPUT_DIR`` and written
    to ``<PATH>/tmp`` under a name built from the value in the first row,
    column position 11, of the loaded frame. Empty files are skipped. Files
    that fail to load are skipped as well, but the failure is reported instead
    of being swallowed silently.
    """
    tmp_dir = os.path.join(PATH,'tmp')
    # Make sure the target folder exists before anything is exported into it.
    os.makedirs(tmp_dir, exist_ok=True)
    for file in files:
        if not file.endswith(".csv"):
            continue
        in_path = os.path.join(INPUT_DIR, file)
        try:
            df = csv.load(in_path, 'date', True)
        except Exception as err:
            # Best effort: keep going, but say which file was dropped and why.
            print(f'skip file {in_path}: {err}')
            continue
        if df.empty:
            continue
        # NOTE(review): column position 11 is presumably the plot id - confirm.
        plot_id = df.iloc[0, 11]
        out_path = os.path.join(tmp_dir, f'plot_{plot_id}.csv')
        csv.export(df, out_path, True)
rename_file()

export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_5393.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_5395.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_5239.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_10845.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_8928.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6896.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6899.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6901.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6900.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6903.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6902.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6889.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_6888.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\tmp\plot_

In [9]:
def count_dates() -> pd.DataFrame:
    """Count how often each date occurs across all input CSV files.

    Every ``.csv`` entry of ``files`` is loaded from ``INPUT_DIR``; each index
    label is formatted as ``YYYYMMDD`` and tallied once per row. The tally is
    sorted by date, exported to ``date_path`` and returned.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (``YYYYMMDD`` string) and ``count``, ordered by date.
    """
    occurrences = {}
    for file in files:
        if not file.endswith(".csv"):
            continue
        in_path = os.path.join(INPUT_DIR, file)
        df = csv.load(in_path, 'date', True)
        # Only the index labels are needed, so walk the index directly rather
        # than materialising every row with iterrows().
        for stamp in df.index:
            date = stamp.strftime('%Y%m%d')
            occurrences[date] = occurrences.get(date, 0) + 1
    # export output as csv
    output = pd.DataFrame({'date':list(occurrences.keys()), 'count':list(occurrences.values())})
    output.sort_values(by='date', ascending=True, inplace=True)
    csv.export(output, date_path, index=False)
    return output
count_dates()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\occurrence_val_grid.csv


Unnamed: 0,date,count
56,20170410,8768
112,20170420,5826
0,20170430,17772
1,20170510,17772
57,20170527,5293
...,...,...
115,20210901,6676
107,20210903,5293
54,20210906,17772
108,20210913,5262


3 Merge all data frames to one csv file

In [11]:
def merge_data_frame() -> pd.DataFrame:
    """Pre-process every input CSV and collect the ids of the processed files.

    For each ``.csv`` entry of ``files`` the frame is loaded from
    ``INPUT_DIR``, aligned to the date table stored at ``date_path`` with
    ``shuffle``, filled with zeros by ``zero_padding`` and exported to
    ``DATA_DIR``. The value in the first row, column position 11, of each
    loaded frame is collected. The collected values are exported to
    ``merge_path`` as a one-column table and returned.

    Returns
    -------
    pd.DataFrame
        Single column ``id`` with one row per processed input file.
    """
    dates = csv.load(date_path, 'date', True)
    # Make sure the target folder exists before anything is exported into it.
    os.makedirs(DATA_DIR, exist_ok=True)
    # Collect ids in a list and build the frame once, rather than enlarging a
    # DataFrame row by row inside the loop.
    ids = []
    for file in files:
        if not file.endswith(".csv"):
            continue
        in_path = os.path.join(INPUT_DIR, file)
        df = csv.load(in_path, 'date', True)
        if df.empty:
            # An empty frame has no first row to read the id from; skip it
            # (rename_file treats empty inputs the same way) and say so.
            print(f'skip empty file {in_path}')
            continue
        # NOTE(review): column position 11 is presumably the plot id - confirm.
        plot_id = df.iloc[0, 11]
        ids.append(plot_id)
        # pre-process
        df = zero_padding(shuffle(df, dates))
        # export each new csv file
        # NOTE(review): file[5:] drops the first five characters, presumably
        # the 'plot_' prefix written by rename_file - confirm.
        out_path = os.path.join(DATA_DIR, file[5:])
        csv.export(df, out_path, True)
    merged = pd.DataFrame({'id': ids}, columns=['id'])
    csv.export(merged, merge_path, False)
    return merged
merge_data_frame()

export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10000.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10001.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10002.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10003.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10004.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10005.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10006.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10009.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\10010.csv
export file D:\Deutschland\FUB\master_thesis\data\gee\output\validation_grid_daily_padding\

Unnamed: 0,id
0,10000
1,10001
2,10002
3,10003
4,10004
...,...
17767,9995
17768,9996
17769,9997
17770,9998
