<h1>Chapter 1</h1>
<h2>Random forest reconstructing North / South monthly data</h2>

<h2>1. Import package and Extract data</h2>

In [None]:
# PACKAGE
import numpy as np
import pandas as pd
import datetime as dt
import netCDF4 as nc4
from sklearn.ensemble import RandomForestRegressor

# CONFIGURATION
# Directory holding the yearly NetCDF files.
# Data to be downloaded from OzFlux data portal: https://data.ozflux.org.au/pub/viewColDetails.jspx?collection.id=1883250&collection.owner.id=768&viewType=anonymous
file_dir = "Data/"

# Years covered (2011 to 2025 inclusive) and the matching NetCDF filenames
years_all = range(2011, 2026)
netcdf_files = [f"Gingin_{year}_L3.nc" for year in years_all]

# Seconds subtracted when converting "days since 1800-01-01" to a Unix timestamp.
# 5364691200 = 62091 days * 86400 s (1800-01-01 to 1970-01-01) + 28800 s (8 h).
# NOTE(review): the extra 8 h presumably accounts for a UTC+8 site clock - confirm.
convert_unix_800 = 5364691200

# Random Forest (RF) settings, passed as keyword arguments to the regressor
rf_kwargs = {
    "n_estimators": 1000,
    "random_state": 42,
    "n_jobs": -1,
    "oob_score": True,
}

# Variables: RF target, RF predictors, and the wind-direction column used for sector filtering
target_col = "Et"
feature_cols = [
    "Fsd",
    "Ws",
    "L",
    "sigmav",
    "ustar",
    "Fn",
    "Fg",
    "Sws_40cm",
    "Sws_80cm",
]
wind_col = "Wd"

# Year ranges (end value exclusive) for the north record and the three south phases
yr_north = np.arange(2011, 2026)
yr_pre_fire = np.arange(2011, 2017)
yr_fire = np.arange(2016, 2018)
yr_post_fire = np.arange(2017, 2026)
months_all = np.arange(1, 13)

# EXTRACT FULL FLUX DATASET
def _load_flux_file(path):
    """Read one yearly NetCDF file into a datetime-indexed DataFrame.

    Parameters
    ----------
    path : str
        Location of the NetCDF file to open.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's ``time`` variable. Columns are the
        wind direction, the flux/met variables listed below, and the
        ``year``/``month``/``day``/``hour`` fields of the index. Masked
        values are replaced by NaN.
    """
    # Variables copied out of the file, in output column order
    var_names = [
        wind_col,
        "Fsd",
        "Et",
        "Ws",
        "L",
        "sigmav",
        "ustar",
        "Fn",
        "Fg",
        "Sws_40cm",
        "Sws_80cm",
    ]

    # The context manager releases the file handle even when a variable is
    # missing; everything needed is pulled into arrays inside the block.
    with nc4.Dataset(path) as ds:
        time_arr = ds["time"][:]
        days_since_1800 = time_arr.reshape(len(time_arr)).filled(np.nan)
        # NOTE(review): assumes every variable holds one value per time step - confirm.
        values = np.hstack(
            [ds[name][:].reshape(-1, 1).filled(np.nan) for name in var_names]
        )

    # Days since 1800-01-01 -> seconds -> shifted by the configured offset
    timestamps = pd.Series(days_since_1800 * 86400 - convert_unix_800)

    # NOTE(review): datetime.fromtimestamp() converts using the timezone of the
    # machine running this code, and the naive result is then labelled
    # "greenwich"; the index therefore depends on the local timezone setting.
    # Confirm this is the intended behaviour.
    naive_times = timestamps.apply(lambda x: dt.datetime.fromtimestamp(x))
    index = pd.DatetimeIndex(naive_times, tz="greenwich", name="datetime")

    frame = pd.DataFrame(values, columns=var_names, index=index)
    frame["year"] = frame.index.year
    frame["month"] = frame.index.month
    frame["day"] = frame.index.day
    frame["hour"] = frame.index.hour
    return frame


fc_list = [_load_flux_file(file_dir + fn) for fn in netcdf_files]

# Polish
# Stack the yearly frames into one chronologically ordered record
fc = pd.concat(fc_list).sort_index()

# Remove bad data in September 2012
# The measurement columns of that month are blanked in place; the rows and
# their year/month/day/hour fields are kept. A boolean mask touches only these
# rows, whereas de-duplicating on (year, month, day, hour) would also discard
# every other record that shares an hour with another one, anywhere in the series.
cols_nan = ["Fsd","Et","Ws","L","sigmav","ustar","Fn","Fg","Sws_40cm","Sws_80cm",wind_col]
bad_sep_2012 = (fc.index.year == 2012) & (fc.index.month == 9)
fc.loc[bad_sep_2012, cols_nan] = np.nan

<h2>2. Train RF models on the north and south measurements separately and make predictions for the respective areas</h2>

In [None]:
# EMPTY MONTHLY OUTPUT TABLES
def init_monthly_df(yrs):
    """Create an all-NaN monthly table for the given years.

    Parameters
    ----------
    yrs : numpy.ndarray
        Years to cover; each is paired with every month in ``months_all``.

    Returns
    -------
    pandas.DataFrame
        Indexed by a (year, month) MultiIndex of floats, with the seven
        sum columns below initialised to NaN.
    """
    column_names = [
        "sum_prediction",
        "sum_truth_train",
        "sum_truth_cannot_predict",
        "original_data_sum",
        "truth_train_data_sum",
        "truth_cannot_predict_data_sum",
        "predicted_data_sum",
    ]
    year_month = pd.MultiIndex.from_product(
        [yrs.astype(float), months_all.astype(float)],
        names=["year", "month"],
    )
    return pd.DataFrame(np.nan, index=year_month, columns=column_names)

# One empty monthly table for the north record and one per south phase
(
    fc_monthly_north,
    fc_monthly_south_pre_fire,
    fc_monthly_south_fire,
    fc_monthly_south_post_fire,
) = (init_monthly_df(yrs) for yrs in (yr_north, yr_pre_fire, yr_fire, yr_post_fire))

# PARTITION LOGIC
def _partition(df, site):
    """Split records by target availability, feature completeness and wind sector.

    Parameters
    ----------
    df : pandas.DataFrame
        Records containing the target, feature and wind-direction columns.
    site : str
        "north" keeps wind directions <= 67.5 or >= 292.5; "south" keeps
        112.5 to 247.5 inclusive; any other value disables wind filtering.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(truth_train, truth_cannot, to_predict)``:
        rows with a target, complete features and an accepted wind direction;
        the remaining rows that have a target;
        rows without a target but with complete features and an accepted
        wind direction.
    """
    if site == "north":
        # North: Wd <= 67.5 OR Wd >= 292.5
        wind_ok = (df[wind_col] <= 67.5) | (df[wind_col] >= 292.5)
    elif site == "south":
        # South: 112.5 <= Wd <= 247.5
        wind_ok = (df[wind_col] >= 112.5) & (df[wind_col] <= 247.5)
    else:
        # Fallback: every row passes the wind test
        wind_ok = pd.Series(True, index=df.index)

    target_present = df[target_col].notna()
    features_complete = df[feature_cols].notna().all(axis=1)

    trainable = target_present & features_complete & wind_ok
    other_truth = target_present & ~trainable
    fillable = ~target_present & features_complete & wind_ok

    return df[trainable], df[other_truth], df[fillable]

# COMBINED MONTH GROUPS
# Each array lists the calendar months that are pooled into one group;
# the groups of a site together cover all twelve months.
north_groups = [
    np.array(months)
    for months in ([1, 12], [2, 3, 4, 11], [5, 7, 9], [6, 8, 10])
]

south_groups = [
    np.array(months)
    for months in ([1, 2, 3], [4, 6, 7, 8, 10, 11], [5, 9, 12])
]

# PROCESS PHASES (PRE-FIRE,FIRE,POST-FIRE)
def process_phase(fc_phase, years, groups, df_out, site):
    """Train one RF per month group and fill the monthly table for a phase.

    For every month group, a RandomForestRegressor is fitted on all rows of
    ``fc_phase`` in that group that ``_partition`` returns as training truth.
    Each (year, month) with data then gets one row in ``df_out`` holding the
    summed target values and the number of records behind each sum.

    Parameters
    ----------
    fc_phase : pandas.DataFrame
        Datetime-indexed records of the phase.
    years : iterable of int
        Years to report.
    groups : list of numpy.ndarray
        Month groups; one model is trained per group.
    df_out : pandas.DataFrame
        Table indexed by float (year, month); modified in place. Months
        without any record in ``fc_phase`` are left untouched.
    site : str
        Forwarded to ``_partition`` to select the wind sector.

    Returns
    -------
    None
    """
    for group in groups:
        fc_group = fc_phase[fc_phase.index.month.isin(group)]

        # Global RF training for this group
        truth_train_all, _, _ = _partition(fc_group, site=site)

        if truth_train_all.empty:
            # Nothing to learn from: predictions for this group stay NaN
            rf = None
        else:
            rf = RandomForestRegressor(**rf_kwargs).fit(
                truth_train_all[feature_cols].values,
                truth_train_all[target_col].values,
            )

        for y in years:
            fc_year = fc_group[fc_group.index.year == y]
            if fc_year.empty:
                continue

            for m in group:
                fc_month = fc_year[fc_year.index.month == m]
                if fc_month.empty:
                    continue

                truth_train, truth_cannot, to_predict = _partition(fc_month, site=site)

                idx = (float(y), float(m))

                sum_truth_train = truth_train[target_col].sum()
                sum_truth_cannot = truth_cannot[target_col].sum()

                # The *_data_sum columns hold record counts, so the downstream
                # ratio sum / data_sum is a mean; storing the value sums there
                # as well would make that ratio 1.0 for every month.
                n_truth_train = len(truth_train)
                n_truth_cannot = len(truth_cannot)

                if rf is None or to_predict.empty:
                    # NOTE(review): NaN here propagates into the month's ET even
                    # when measured values exist - confirm this is intended.
                    sum_pred = np.nan
                    n_pred = np.nan
                else:
                    sum_pred = rf.predict(to_predict[feature_cols].values).sum()
                    n_pred = len(to_predict)

                # Order matches the columns created by init_monthly_df
                df_out.loc[idx] = [
                    sum_pred,
                    sum_truth_train,
                    sum_truth_cannot,
                    n_truth_train + n_truth_cannot,
                    n_truth_train,
                    n_truth_cannot,
                    n_pred,
                ]

# NORTH
process_phase(fc, yr_north, north_groups, fc_monthly_north, site="north")

# SOUTH: date windows of the pre-fire, fire and post-fire phases
fc_pre_fire = fc.loc["2011-01-01":"2016-10-13"]
fc_fire = fc.loc["2016-10-14":"2017-10-13"]
fc_post_fire = fc.loc["2017-10-14":]

# Each south phase is processed on its own window, years and output table
for _phase_data, _phase_years, _phase_table in (
    (fc_pre_fire, yr_pre_fire, fc_monthly_south_pre_fire),
    (fc_fire, yr_fire, fc_monthly_south_fire),
    (fc_post_fire, yr_post_fire, fc_monthly_south_post_fire),
):
    process_phase(_phase_data, _phase_years, south_groups, _phase_table, site="south")

# MERGE SOUTH PRE-FIRE, FIRE, AND POST-FIRE
def _merge_south_phases(pre_fire, fire, post_fire):
    """Combine the three south phase tables into one continuous monthly table.

    October 2016 is split between the pre-fire and fire windows and October
    2017 between the fire and post-fire windows, so those two months are the
    element-wise sum of the rows of both neighbouring phases. All other
    months are taken from the single phase that covers them.

    Rows are selected by their (year, month) labels: the phase tables carry a
    MultiIndex, so integer keys such as ``.loc[69]`` do not address a row.

    Parameters
    ----------
    pre_fire, fire, post_fire : pandas.DataFrame
        Phase tables indexed by float (year, month).

    Returns
    -------
    pandas.DataFrame
        One row per month in chronological order, with ``year`` and ``month``
        as ordinary columns and a fresh integer index.
    """
    # Label slicing on a MultiIndex requires a sorted index
    pre_fire, fire, post_fire = (
        table.sort_index() for table in (pre_fire, fire, post_fire)
    )

    oct_2016 = (2016.0, 10.0)
    oct_2017 = (2017.0, 10.0)

    pieces = [
        # January 2011 to September 2016
        pre_fire.loc[:(2016.0, 9.0)],
        # October 2016: pre-fire part + fire part (one-row frames align on the label)
        pre_fire.loc[[oct_2016]] + fire.loc[[oct_2016]],
        # November 2016 to September 2017
        fire.loc[(2016.0, 11.0):(2017.0, 9.0)],
        # October 2017: fire part + post-fire part
        fire.loc[[oct_2017]] + post_fire.loc[[oct_2017]],
        # November 2017 onwards
        post_fire.loc[(2017.0, 11.0):],
    ]
    return pd.concat(pieces, axis=0).reset_index()


fc_monthly_south = _merge_south_phases(
    fc_monthly_south_pre_fire,
    fc_monthly_south_fire,
    fc_monthly_south_post_fire,
)

# POLISH
def _et_ratio(table):
    """Return the three ``sum_*`` columns added together, divided by the three
    matching ``*_data_sum`` columns added together, row by row."""
    numerator = (
        table['sum_prediction']
        + table['sum_truth_train']
        + table['sum_truth_cannot_predict']
    )
    denominator = (
        table['predicted_data_sum']
        + table['truth_train_data_sum']
        + table['truth_cannot_predict_data_sum']
    )
    return numerator / denominator


# Same ratio for both sites, stored as a new 'ET' column
fc_monthly_north['ET'] = _et_ratio(fc_monthly_north)
fc_monthly_south['ET'] = _et_ratio(fc_monthly_south)