In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from fttransformer.utils.preprocessing import df_to_dataset, build_categorical_prep
from fttransformer.models.fttransformer import FTTransformerEncoder, FTTransformer, MLPBlock
import tensorflow as tf
import keras_tuner 

Using TensorFlow backend


In [1]:
# --- Input data ------------------------------------------------------------
# Folder holding the processed CSV files, plus the individual file names.
DATA_FOLDER = '../processed-datasets/'
UPSTREAM_TRAIN_FILENAME = 'upstream_train.csv'
UPSTREAM_VAL_FILENAME = 'upstream_val.csv'
DOWNSTREAM_TRAIN_FILENAME = 'downstream_train.csv'
DOWNSTREAM_TEST_FILENAME = 'downstream_test.csv'

# --- Model artefacts -------------------------------------------------------
# Every directory below is a sub-folder of MODEL_FOLDER (trailing slash kept).
MODEL_FOLDER = '../model/'

UPSTREAM_TRAINING_DIR = MODEL_FOLDER + 'upstream-training/'
UPSTREAM_FINETUNE_DIR = MODEL_FOLDER + 'upstream-finetune/'
UPSTREAM_FINETUNE_OPTUNA_DIR = MODEL_FOLDER + 'upstream-finetune-optuna/'
DOWNSTREAM_TRAINING_DIR = MODEL_FOLDER + 'downstream-train/'
DOWNSTREAM_E2E_DIR = MODEL_FOLDER + 'downstream-e2e/'
DOWNSTREAM_FINETUNE_DIR = MODEL_FOLDER + 'downstream-finetune-from-scratch/'
DOWNSTREAM_PARTIAL_TUNE_DIR = MODEL_FOLDER + 'downstream-partial-tune/'

# --- Figures ---------------------------------------------------------------
MODEL_RESULT_PLT_DIR = '../model_result_plt/'

In [4]:
# Raise the TensorFlow logger threshold to ERROR so lower-severity messages are not emitted.
tf.get_logger().setLevel('ERROR')

In [9]:
# Tuning objective(s): minimise the RMSE metric reported under this validation metric name.
finetune_objective = [
    keras_tuner.Objective(
        name='val_output_root_mean_squared_error',
        direction="min",
    ),
]

In [None]:
def embed_catgorical_features(df, categorical_columns):
    """Cast the given columns of ``df`` to ``str`` and return the same frame.

    Args:
        df: DataFrame to modify. It is mutated in place.
        categorical_columns: Column labels whose values are converted with
            ``astype(str)``.

    Returns:
        The very same ``df`` object that was passed in (not a copy).
    """
    as_text = df[categorical_columns].astype(str)
    # Assign back into the caller's frame: the in-place update is part of the contract.
    df[categorical_columns] = as_text
    return df

In [None]:
def compile_model(
    df_dataset,
    d_embedding: int,
    n_layers: int,
    ffn_factor: float,
    attention_dropout: float,
    ffn_dropout: float,
    residual_dropout: float,
    weight_decay: float,
    lr: float,
    numerical_embedding_type='linear',
    mlp_head_unit_1=32,
    mlp_head_unit_2=32,
    out_activation='linear',
):
    """Assemble and compile an FT-Transformer model with a single output.

    The model is an ``FTTransformer`` wrapping an ``FTTransformerEncoder`` and
    an ``MLPBlock`` head. It is compiled with AdamW, mean-squared-error loss
    and a root-mean-squared-error metric.

    Relies on the module-level ``NUMERIC_FEATURES`` and
    ``CATEGORICAL_FEATURES`` lists, which must be defined before calling.

    Args:
        df_dataset: Frame indexed with ``NUMERIC_FEATURES`` and
            ``CATEGORICAL_FEATURES``; the resulting ``.values`` arrays are
            handed to the encoder.
        d_embedding: Passed to the encoder as ``embedding_dim``.
        n_layers: Passed to the encoder as ``depth``.
        ffn_factor: Passed to the encoder as ``ffn_factor``.
        attention_dropout: Passed to the encoder as ``attn_dropout``.
        ffn_dropout: Passed to the encoder as ``ff_dropout``.
        residual_dropout: Passed to the encoder as ``residual_dropout``.
        weight_decay: AdamW weight decay.
        lr: AdamW learning rate.
        numerical_embedding_type: Passed to the encoder unchanged.
        mlp_head_unit_1: First positional argument of ``MLPBlock``.
        mlp_head_unit_2: Second positional argument of ``MLPBlock``.
        out_activation: Fourth positional argument of ``MLPBlock``.

    Returns:
        The compiled ``FTTransformer`` instance.
    """
    # Construction order (head -> encoder -> model -> optimizer) is kept as-is.
    # NOTE(review): the literal 1 is presumably the head's output width - confirm against MLPBlock.
    regression_head = MLPBlock(mlp_head_unit_1, mlp_head_unit_2, 1, out_activation)

    encoder_options = dict(
        numerical_features=NUMERIC_FEATURES,  # list of numeric feature names
        categorical_features=CATEGORICAL_FEATURES,  # list of categorical feature names
        numerical_data=df_dataset[NUMERIC_FEATURES].values,
        categorical_data=df_dataset[CATEGORICAL_FEATURES].values,
        y=None,
        numerical_embedding_type=numerical_embedding_type,
        embedding_dim=d_embedding,
        depth=n_layers,
        ffn_factor=ffn_factor,
        attn_dropout=attention_dropout,
        ff_dropout=ffn_dropout,
        residual_dropout=residual_dropout,
        explainable=True,
    )
    encoder = FTTransformerEncoder(**encoder_options)

    model = FTTransformer(encoder=encoder, out_dim=1, head=regression_head)

    adamw = tf.keras.optimizers.AdamW(learning_rate=lr, weight_decay=weight_decay)
    model.compile(
        optimizer=adamw,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [5]:
def get_random_search_instance(objective, save_dir, overwrite=False, hypermodel=None, max_trials=30):
    """Create a ``keras_tuner.RandomSearch`` tuner stored under ``save_dir``.

    Args:
        objective: Tuning objective, forwarded to ``RandomSearch`` untouched.
            It is no longer inspected here, so a single objective object, a
            string or a list are all passed through as-is.
        save_dir: Directory handed to the tuner as ``directory``.
        overwrite: Forwarded as ``overwrite`` (defaults to ``False``).
        hypermodel: Forwarded as ``hypermodel`` (defaults to ``None``).
        max_trials: Forwarded as ``max_trials`` (defaults to 30).

    Returns:
        The constructed ``keras_tuner.RandomSearch`` instance, using the fixed
        project name ``"finetune_progress"``.
    """
    # A metric-name string used to be assembled from the objective(s) here but was
    # never used; building it raised for non-iterable objectives and for lists of
    # plain strings, so it has been dropped.
    return keras_tuner.RandomSearch(
        hypermodel=hypermodel,
        objective=objective,
        max_trials=max_trials,
        overwrite=overwrite,
        directory=save_dir,
        project_name="finetune_progress",
    )

In [None]:
def load_best_model(objective, save_dir, hypermodel):
    """Reload the tuner stored in ``save_dir`` and return its top-ranked model.

    Args:
        objective: Objective passed on to ``get_random_search_instance``.
        save_dir: Directory of the existing tuner project.
        hypermodel: Hypermodel passed on to ``get_random_search_instance``.

    Returns:
        The first model returned by ``tuner.get_best_models()``, after calling
        its ``build()`` and ``summary()`` methods.
    """
    # overwrite=False so the existing tuning results are reused rather than wiped.
    search = get_random_search_instance(objective, save_dir, overwrite=False, hypermodel=hypermodel)
    top_model = search.get_best_models()[0]
    top_model.build()
    top_model.summary()
    return top_model

In [None]:
def get_predict_result(model, predict_input_df, actual_df):
    """Run ``model`` on ``predict_input_df`` and print the RMSE against ``actual_df``.

    Args:
        model: Object whose ``predict`` returns a mapping with an ``'output'``
            array (flattened with ``ravel()`` before scoring).
        predict_input_df: Frame converted with ``df_to_dataset`` (no shuffling,
            batch size 1) and fed to ``model.predict``.
        actual_df: Ground-truth values compared with the flattened predictions.

    Returns:
        The raw result of ``model.predict`` (not just the ``'output'`` entry).
    """
    test_df = df_to_dataset(predict_input_df, shuffle=False, batch_size=1)
    predict_result = model.predict(test_df)
    # ``mean_squared_error(..., squared=False)`` was deprecated in scikit-learn 1.4 and
    # removed afterwards. Taking the root of the per-output MSE and averaging gives
    # the same number without that keyword.
    per_output_mse = mean_squared_error(
        actual_df, predict_result['output'].ravel(), multioutput='raw_values'
    )
    linear_rms = np.sqrt(per_output_mse).mean()
    print("Linear Encoding RMSE:", round(linear_rms, 4))
    return predict_result

In [None]:
def get_modis_compare_data(predict_df):
    """Load the MODIS comparison table and patch out-of-range target values.

    Reads ``compare_modis_test.csv`` from ``DATA_FOLDER``. Wherever an existing
    value in the ``TARGET_FEATURE`` column is greater than 60000, it is replaced
    with the corresponding entry of ``predict_df`` (aligned by index through
    ``DataFrame.update``).

    Args:
        predict_df: Predicted values used as replacements, wrapped into a
            one-column frame named ``TARGET_FEATURE``.

    Returns:
        The loaded comparison frame after the in-place update.
    """
    def exceeds_threshold(values):
        # NOTE(review): 60000 presumably marks fill/no-data codes - confirm against the data source.
        return values > 60000

    modis_frame = pd.read_csv(f'{DATA_FOLDER}compare_modis_test.csv')
    replacement_frame = pd.DataFrame({TARGET_FEATURE: predict_df})
    modis_frame.update(replacement_frame, filter_func=exceeds_threshold)
    return modis_frame

In [None]:
def plot_result_distribution(actual_df, predict_df, compare_df, save_dir=None, title=None):
    """Plot estimated values against the actual values.

    Draws a red identity line (actual vs. actual) plus two marker series on the
    current pyplot figure: the predictions and the comparison estimates
    (labelled ``MOD17``).

    Args:
        actual_df: Reference values, used as the x coordinates of every series.
        predict_df: Predicted values (cyan markers, label ``prediction``).
        compare_df: Comparison values (orange markers, label ``MOD17``).
        save_dir: If truthy, passed to ``plt.savefig`` before showing.
        title: If truthy, used as the plot title.
    """
    # Identity line: points on it are perfect estimates.
    plt.plot(actual_df, actual_df, color='red')

    marker_series = (
        (predict_df, {'color': 'c', 'label': 'prediction'}),
        (compare_df, {'color': 'orange', 'label': 'MOD17', 'alpha': 0.7}),
    )
    for estimates, style in marker_series:
        plt.plot(actual_df, estimates, 'o', **style)

    plt.xlabel("FLUXNET GPP")
    plt.ylabel("Estimated GPP")
    if title:
        plt.title(title)
    plt.legend()
    if save_dir:
        # Save before show().
        plt.savefig(save_dir)
    plt.show()
    

In [None]:
def get_input_tensors(tf_dataset):
    """Create one Keras ``Input`` per feature of ``tf_dataset``.

    Args:
        tf_dataset: Dataset whose ``element_spec[0]`` is a mapping from feature
            name to type spec.

    Returns:
        A dict mapping each feature name to
        ``tf.keras.layers.Input(name=<feature name>, type_spec=<its spec>)``.
    """
    feature_specs = tf_dataset.element_spec[0]
    return {
        feature_name: tf.keras.layers.Input(name=feature_name, type_spec=spec)
        for feature_name, spec in feature_specs.items()
    }

In [None]:
def get_rmse_on_upstream_val(model, upstream_val_df):
    """Score ``model`` on the upstream validation frame and print the RMSE.

    Relies on the module-level ``FEATURES`` and ``TARGET_FEATURE`` names.

    Args:
        model: Object whose ``predict`` returns a mapping with an ``'output'``
            array (flattened with ``ravel()`` before scoring).
        upstream_val_df: Frame containing the ``FEATURES`` columns (model
            input) and the ``TARGET_FEATURE`` column (ground truth).

    Returns:
        The RMSE rounded to 4 decimals.
    """
    test_df = df_to_dataset(upstream_val_df[FEATURES], shuffle=False, batch_size=1)
    linear_test_preds = model.predict(test_df)
    # ``mean_squared_error(..., squared=False)`` was deprecated in scikit-learn 1.4 and
    # removed afterwards; root-of-per-output-MSE averaged over outputs is the same value.
    per_output_mse = mean_squared_error(
        upstream_val_df[TARGET_FEATURE],
        linear_test_preds['output'].ravel(),
        multioutput='raw_values',
    )
    # Builtin round() works for NumPy scalars and plain floats alike.
    me = round(np.sqrt(per_output_mse).mean(), 4)
    print(f'RMSE on upstream validation: {me}')
    return me

In [None]:
def get_all_metrics(predict_df, modis_df, actual_df):
    """Print every evaluation metric for the predictions and the MOD17 baseline.

    Always reports RMSE, R-squared, max error and pinball loss. The Poisson and
    Gamma deviances are reported only when the inputs are inside the domain
    scikit-learn accepts; otherwise a message is printed and they are skipped,
    so the remaining metrics still run.

    Args:
        predict_df: Model predictions (must support ``< 0`` / ``== 0``
            element-wise comparison, e.g. NumPy array or pandas object).
        modis_df: MOD17 estimates, same requirements.
        actual_df: Ground-truth values, same requirements.
    """
    get_rmse_metrics(predict_df, modis_df, actual_df)
    get_r2_metrics(predict_df, modis_df, actual_df)
    get_max_error_metrics(predict_df, modis_df, actual_df)

    named_inputs = (
        ('predict_df', predict_df),
        ('modis_df', modis_df),
        ('actual_df', actual_df),
    )
    # First input (in the order above) holding a negative value, if any.
    negative_name = next(
        (name for name, values in named_inputs if np.any(values < 0)), None
    )
    if negative_name is not None:
        print(f'{negative_name} contains negative values, skip mean_poisson_deviance metrics')
    else:
        # Both deviances require strictly positive estimates (y_pred > 0); predict_df
        # and modis_df are each used as y_pred by the helpers.
        zero_estimate_name = next(
            (name for name, values in named_inputs[:2] if np.any(values == 0)), None
        )
        if zero_estimate_name is not None:
            print(f'{zero_estimate_name} contains zero values, '
                  'skip mean_poisson_deviance and mean_gamma_deviance metrics')
        else:
            get_mean_poisson_deviance_metrics(predict_df, modis_df, actual_df)
            # Gamma deviance additionally needs strictly positive ground truth.
            if np.any(actual_df == 0):
                print('actual_df contains zero values, skip mean_gamma_deviance metrics')
            else:
                get_mean_gamma_deviance_metrics(predict_df, modis_df, actual_df)

    get_mean_pinball_loss_metrics(predict_df, modis_df, actual_df)
    

In [None]:
from sklearn.metrics import mean_squared_error

# def get_modis_result(modis_df, actual_df):
def get_rmse_metrics(predict_df, modis_df, actual_df):
    """Print the RMSE of the predictions and of MOD17 against the actual values.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.

    Returns:
        The prediction RMSE rounded to 4 decimals (the MOD17 value is only printed).
    """
    def _rmse(estimates):
        # ``mean_squared_error(..., squared=False)`` was deprecated in scikit-learn 1.4
        # and removed afterwards; root of the per-output MSE, averaged over outputs,
        # reproduces the same value.
        per_output_mse = mean_squared_error(actual_df, estimates, multioutput='raw_values')
        return round(np.sqrt(per_output_mse).mean(), 4)

    me = _rmse(predict_df)
    print(f'RMSE prediction: {me}')
    print(f'RMSE MOD17: {_rmse(modis_df)}')
    return me

In [None]:
from sklearn.metrics import r2_score

# def get_r2_compare_data(predict_df, modis_df, actual_df):
def get_r2_metrics(predict_df, modis_df, actual_df):
    """Print the R-squared score of the predictions and of MOD17.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.

    Returns:
        The prediction R-squared rounded to 4 decimals (the MOD17 value is only printed).
    """
    # Builtin round() instead of ``.round``: the method exists only on NumPy types,
    # while the metric may come back as a plain Python float.
    me = round(r2_score(actual_df, predict_df), 4)
    print(f'R-Squared prediction: {me}')
    # MOD17 is rounded as well so both lines use the same precision.
    print(f'R-Squared MOD17: {round(r2_score(actual_df, modis_df), 4)}')
    return me

In [None]:
from sklearn.metrics import max_error

def get_max_error_metrics(predict_df, modis_df, actual_df):
    """Print the maximum residual error of the predictions and of MOD17.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.

    Returns:
        The prediction max error rounded to 4 decimals (the MOD17 value is only printed).
    """
    # Builtin round() instead of ``.round``: the method exists only on NumPy types,
    # while the metric may come back as a plain Python float.
    me = round(max_error(actual_df, predict_df), 4)
    print(f'Max error prediction: {me}')
    # MOD17 is rounded as well so both lines use the same precision.
    print(f'Max error MOD17: {round(max_error(actual_df, modis_df), 4)}')
    return me

In [None]:
from sklearn.metrics import mean_poisson_deviance

# equivalent: mean_tweedie_deviance when power=1
def get_mean_poisson_deviance_metrics(predict_df, modis_df, actual_df):
    """Print the mean Poisson deviance of the predictions and of MOD17.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.

    Returns:
        The prediction deviance rounded to 4 decimals (the MOD17 value is only printed).
    """
    # Builtin round() instead of ``.round``: the method exists only on NumPy types,
    # while the metric may come back as a plain Python float.
    me = round(mean_poisson_deviance(actual_df, predict_df), 4)
    print(f'Mean poisson deviance prediction: {me}')
    # MOD17 is rounded as well so both lines use the same precision.
    print(f'Mean poisson deviance MOD17: {round(mean_poisson_deviance(actual_df, modis_df), 4)}')
    return me

In [None]:
from sklearn.metrics import mean_gamma_deviance

# equivalent: mean_tweedie_deviance when power=2
def get_mean_gamma_deviance_metrics(predict_df, modis_df, actual_df):
    """Print the mean Gamma deviance of the predictions and of MOD17.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.

    Returns:
        The prediction deviance rounded to 4 decimals (the MOD17 value is only printed).
    """
    # Builtin round() instead of ``.round``: the method exists only on NumPy types,
    # while the metric may come back as a plain Python float.
    me = round(mean_gamma_deviance(actual_df, predict_df), 4)
    print(f'Mean gamma deviance prediction: {me}')
    # MOD17 is rounded as well so both lines use the same precision.
    print(f'Mean gamma deviance MOD17: {round(mean_gamma_deviance(actual_df, modis_df), 4)}')
    return me

In [None]:
from sklearn.metrics import mean_pinball_loss

def get_mean_pinball_loss_metrics(predict_df, modis_df, actual_df, alpha=0.9):
    """Print the mean pinball loss at ``alpha`` and at ``1 - alpha``.

    For each of the two quantile levels, one line is printed for the
    predictions and one for MOD17, in that order. Nothing is returned.

    Args:
        predict_df: Model predictions.
        modis_df: MOD17 estimates.
        actual_df: Ground-truth values.
        alpha: Quantile level of the first pair of lines; the second pair
            uses ``1 - alpha``.
    """
    mirrored_alpha = 1 - alpha

    # Each loss is computed right before its own print, one line at a time.
    loss = mean_pinball_loss(actual_df, predict_df, alpha=alpha)
    print(f'Mean pinball loss with alpha = {alpha} prediction: {loss}')

    loss = mean_pinball_loss(actual_df, modis_df, alpha=alpha)
    print(f'Mean pinball loss with alpha = {alpha} MOD17: {loss}')

    loss = mean_pinball_loss(actual_df, predict_df, alpha=mirrored_alpha)
    print(f'Mean pinball loss with alpha = {mirrored_alpha} prediction: {loss}')

    loss = mean_pinball_loss(actual_df, modis_df, alpha=mirrored_alpha)
    print(f'Mean pinball loss with alpha = {mirrored_alpha} MOD17: {loss}')