In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Data

import numpy as np
import pandas as pd
import xarray as xr
import bottleneck as bn
import iris

# Plotting

import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Define some filepaths that will be used a lot
# NOTE: both base directories end with '/', and several cells below append
# sub-paths that start with '/', so some joined paths contain '//'.
home_dir = '/home/561/mg5624/RF_project/'  # cells below read from and write to its Data/ folder
my_data_dir = '/g/data/w97/mg5624/'  # root for the predictor files and the saved outputs
shared_data_dir = '/g/data/w97/Shared_data/Observations/'  # not referenced by the cells shown here

## Take Necessary Data from Sanaa's Dataframe

In [3]:
# Read the database that was built for Sanaa's RF model
df = pd.read_csv(home_dir + '/Data/ML_Database_All_AWRA_MOf_and_3MPrecip.csv')

# Discard the leading column and shorten the name of the drought label column
df = (
    df
    .drop(columns=df.columns[0])
    .rename(columns={'Drought / No Drought': 'Drought'})
)
print(df)

     Year     Month Year_Month     Location  Latitude  Longitude  Drought  \
0    2014     March    2014-03        Cobar  -31.4980   145.8383        1   
1    2014  February    2014-02        Cobar  -31.4980   145.8383        1   
2    2014   January    2014-01        Cobar  -31.4980   145.8383        1   
3    2014     March    2014-03      Walgett  -30.0167   148.1167        1   
4    2014  February    2014-02      Walgett  -30.0167   148.1167        1   
..    ...       ...        ...          ...       ...        ...      ...   
930  2020  November    2020-11  Wagga Wagga  -35.1330   147.3670        0   
931  2020  December    2020-12  Wagga Wagga  -35.1330   147.3670        0   
932  2021   January    2021-01  Wagga Wagga  -35.1330   147.3670        0   
933  2021  February    2021-02  Wagga Wagga  -35.1330   147.3670        0   
934  2021     March    2021-03  Wagga Wagga  -35.1330   147.3670        0   

     Deep_Drainage  PET_Actual    E_Actual  Soil_M_root_zone      Qtot  \
0

In [4]:
# Build the drought dataframe: keep the time, location and Drought columns by
# removing every predictor column that came with the loaded database
columns_to_drop_for_drought = [
    'Deep_Drainage', 'PET_Actual', 'E_Actual', 'Soil_M_root_zone', 'Qtot',
    'Rainfall', 'ENSO', 'IOD', 'SAM', 'P_acc_3M',
]
training_df = df.drop(columns=columns_to_drop_for_drought)

print(training_df)

training_df.to_csv(home_dir + '/Data/drought_dataframe.csv')

     Year     Month Year_Month     Location  Latitude  Longitude  Drought
0    2014     March    2014-03        Cobar  -31.4980   145.8383        1
1    2014  February    2014-02        Cobar  -31.4980   145.8383        1
2    2014   January    2014-01        Cobar  -31.4980   145.8383        1
3    2014     March    2014-03      Walgett  -30.0167   148.1167        1
4    2014  February    2014-02      Walgett  -30.0167   148.1167        1
..    ...       ...        ...          ...       ...        ...      ...
930  2020  November    2020-11  Wagga Wagga  -35.1330   147.3670        0
931  2020  December    2020-12  Wagga Wagga  -35.1330   147.3670        0
932  2021   January    2021-01  Wagga Wagga  -35.1330   147.3670        0
933  2021  February    2021-02  Wagga Wagga  -35.1330   147.3670        0
934  2021     March    2021-03  Wagga Wagga  -35.1330   147.3670        0

[935 rows x 7 columns]


# Process New Data

In [5]:
# Define functions needed for processing training data
def add_predictor_to_training_df(training_dataframe, predictor_dataset, predictor_name, replace=False):
    """
    Adds a predictor column to the training dataframe by sampling a gridded dataset.

    For every row, the predictor is selected at the row's 'Year_Month' and at the grid
    cell nearest to the row's 'Longitude'/'Latitude'. Rows dated after the last time
    step of the dataset keep the placeholder value None. The dataframe is modified in
    place and is also returned.

    Args:
    training_dataframe (pd.DataFrame): dataframe containing all other training data; it
        must have 'Longitude', 'Latitude' and 'Year_Month' columns
    predictor_dataset (xr.DataArray): xarray dataset of the predictor variable, indexed
        by 'time', 'lon' and 'lat'
    predictor_name (str): full name of the predictor variable (used as the column name)
    replace (bool): if True and predictor_name already in df, it is replaced with new one (default=False)

    Returns:
    pd.DataFrame: the same dataframe object, with the predictor column added
    """
    # Return the original dataframe if predictor_name exists and we're not replacing it.
    if predictor_name in training_dataframe and not replace:
        return training_dataframe

    # Add empty column to training_df of predictor variable
    training_dataframe[predictor_name] = None
    predictor_col = training_dataframe.columns.get_loc(predictor_name)

    # Pull the coordinates out as plain arrays so the positional loop counter below is
    # valid whatever the dataframe's index labels are.
    training_longs = training_dataframe['Longitude'].to_numpy()
    training_lats = training_dataframe['Latitude'].to_numpy()
    training_time = training_dataframe['Year_Month']

    # The end of the dataset's time axis does not change inside the loop
    last_time = predictor_dataset.coords['time'][-1].data

    # Loop over each entry in training_df and add the corresponding entry from the predictor dataset
    for i, time in enumerate(training_time):
        # Rows are not guaranteed to be in time order, so a row beyond the end of the
        # dataset is skipped rather than ending the loop: later rows may still be in range.
        if pd.to_datetime(time) <= last_time:
            predictor_at_i = predictor_dataset.sel(time=time).sel(
                lon=training_longs[i], lat=training_lats[i], method='nearest'
            ).data
            # Write through the frame itself; assigning via df[col].iat[...] is chained
            # assignment and does not update the frame under pandas Copy-on-Write.
            training_dataframe.iat[i, predictor_col] = predictor_at_i

    return training_dataframe
            

def rename_coord_titles_to_lat_long(dataset):
    """
    Returns the dataset with the names 'X' and 'Y' renamed to 'lon' and 'lat', so that
    coordinate titles are consistent across datasets.
    Args:
    dataset (xr.DataSet): dataset whose coordinates are titled 'X' and 'Y'
    """
    return dataset.rename({'X': 'lon', 'Y': 'lat'})

## Precipitation

Uses AGCD precipitation data, which runs from 1900 to 2023 across the whole of Australia at a resolution of 0.05 degrees. Note that the files loaded below only cover 1900 to 2021.

In [6]:
# Monthly AGCD precipitation
precip_ds = xr.open_dataset('/g/data/w97/amu561/AGCD_drought_metrics/AGCD_1900_2021/AGCD_v1_precip_total_r005_monthly_1900_2021.nc')
precip = precip_ds['precip']

# Files named as the 3-monthly and annual totals
precip_filepath = my_data_dir + '/RF_project/Precipitation/AGCD/'
precip_3months = xr.open_dataarray(precip_filepath + 'AGCD_v1_precip_total_r005_3monthly_1900_2021.nc')
precip_annual = xr.open_dataarray(precip_filepath + 'AGCD_v1_precip_total_r005_annual_1900_2021.nc')

# Add one training column per precipitation product
for precip_name, precip_array in [
    ('Precipitation', precip),
    ('Acc_3-Month_Precipitation', precip_3months),
    ('Acc_Annual_Precipitation', precip_annual),
]:
    training_df = add_predictor_to_training_df(training_df, precip_array, precip_name)

## Runoff

In [7]:
# Load the runoff file (AWRA v7, monthly, per the file name) and add it to the
# training dataframe as the 'Runoff' column
runoff = xr.open_dataarray('/g/data/w97/mg5624/RF_project/Runoff/AWRA/AWRAv7_Runoff_month_1911_2023.nc')
training_df = add_predictor_to_training_df(training_df, runoff, 'Runoff')

## Climate Drivers: ENSO, IOD, SAM

In [8]:
def sort_tabled_dataframe_into_correct_form(dataframe, driver):
    """
    Converts a driver index table into a two-column dataframe of "Year_Month" and
    "<driver>_index". The input dataframe is left unmodified.

    Two input layouts are accepted:
    - wide: a "Year" column followed by one column per month, labelled 1, 2, ... (the
      second column label is the integer 1 or the string '1');
    - long: "Year", "Month" and "Index" columns.

    Args:
    dataframe (pd.DataFrame): Dataframe of the driver's index in tabular form
    driver (str): name of the driver

    Returns:
    pd.DataFrame: columns "Year_Month" (formatted as YYYY-MM) and f"{driver}_index"
    """
    index_col = f'{driver}_index'

    second_col = dataframe.columns[1]
    if isinstance(second_col, (int, np.integer)) or second_col == '1':
        # Transform dataframe so that it has a month column instead of months on the rows
        long_df = dataframe.melt(id_vars=['Year'], var_name='Month', value_name=index_col)
    else:
        # rename() returns a new frame, so the caller's dataframe keeps its 'Index' column
        long_df = dataframe.rename(columns={'Index': index_col})

    # Normalise months through int so that e.g. '1', 1 and '01' all become '01'
    month = long_df['Month'].astype(int).astype(str).str.zfill(2)
    year_month = long_df['Year'].astype(str) + '-' + month

    # Keep only the Year_Month key and the index values
    return pd.DataFrame({'Year_Month': year_month, index_col: long_df[index_col]})


def add_drivers_df_to_training_df(training_df, driver):
    """
    Reads the index CSV of one climate driver and joins it onto the training dataframe.

    The join is an inner merge on "Year_Month", so the result only contains training
    rows whose month is present in the driver's index table.
    Args:
    training_df (pd.DataFrame): Dataframe containing the training data
    driver (str): name of the driver ('ENSO', 'IOD' or 'SAM')
    """
    # Name of the index used for each driver; it forms part of the CSV file name
    index_name = {'ENSO': 'BEST', 'IOD': 'DMI', 'SAM': 'AAO'}
    driver_csv = my_data_dir + f'RF_project/{driver}/{driver}_{index_name[driver]}_index.csv'

    # Load the raw table and reshape it to Year_Month / <driver>_index columns
    driver_table = sort_tabled_dataframe_into_correct_form(pd.read_csv(driver_csv), driver)

    return pd.merge(training_df, driver_table, on='Year_Month', how='inner')

In [9]:
drivers = ['ENSO', 'IOD', 'SAM']
index_name = {'ENSO': 'BEST', 'IOD': 'DMI', 'SAM': 'AAO'}

for driver in drivers:
    # Add the drivers to the training dataframe if they're not in there yet
    if f'{driver}_index' not in training_df.columns:
        training_df = add_drivers_df_to_training_df(training_df, driver)

    # Save the full drivers dataframes to my data dir.
    # The raw table gets its own name so that the notebook-level `df` (the database
    # loaded earlier) is not overwritten by this loop, which would break re-running
    # the earlier cells that read from `df`.
    driver_df = pd.read_csv(my_data_dir + f'RF_project/{driver}/{driver}_{index_name[driver]}_index.csv')
    sorted_df = sort_tabled_dataframe_into_correct_form(driver_df, driver)
    sorted_df.to_csv(my_data_dir + f'RF_project/{driver}/{driver}_{index_name[driver]}_index_sorted.csv')

## Evapotranspiration and Potential Evapotranspiration

In [10]:
# Folder holding the GLEAM v3.6 evapotranspiration products
GLEAM_data_path = my_data_dir + 'RF_project/ET_products/v3_6/'

ET = xr.open_dataarray(GLEAM_data_path + 'ET/ET_1980-2021_GLEAM_v3.6a_MO_Australia_0.05grid.nc')
PET = xr.open_dataarray(GLEAM_data_path + 'PET/PET_1980-2021_GLEAM_v3.6a_MO_Australia_0.05grid.nc')

# Add ET first, then PET, as training columns
for et_name, et_array in (('ET', ET), ('PET', PET)):
    training_df = add_predictor_to_training_df(training_df, et_array, et_name)

## Soil Moisture

In [11]:
SM_path = my_data_dir + 'RF_project/Soil_Moisture/v3_8/'
SM_vars = ['SMsurf', 'SMroot']

for var in SM_vars:
    sm_dataset = xr.open_dataset(SM_path + f'{var}/{var}_1980-2022_GLEAM_v3.8a_MO_Australia_0.05grid.nc')

    # The data variable read from each file has the same name as the loop variable
    sm_dataarray = sm_dataset[var]

    training_df = add_predictor_to_training_df(training_df, sm_dataarray, var)

## Change in Water Storage

In [12]:
# CWS = xr.open_dataarray(my_data_dir + '/RF_project/Water_Storage/CWS_v03_JPL_MSWEP_monthly_ensemble_mean_Australia_0.05grid.nc')
# training_df = add_predictor_to_training_df(training_df, CWS, 'CWS')

In [13]:
# print(training_df)

## Months

In [14]:
def add_cyclical_month_columns_to_training_df(training_dataframe):
    """
    Returns the training dataframe with two extra columns, "Sin_month" and "Cos_month",
    that encode the calendar month cyclically.

    The month named in the "Month" column (January = 1 ... December = 12) is placed at
    the angle 2*pi*month/12. The columns are attached with an inner merge on "Month",
    so only rows whose "Month" is one of the twelve English month names are returned.
    Args:
    training_dataframe (pd.DataFrame): Dataframe containing the training data
    """
    month_names = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]

    # One angle per month, in the same order as month_names
    month_angles = 2 * np.pi * np.arange(1, 13) / 12

    month_lookup = pd.DataFrame({
        'Month': month_names,
        'Sin_month': np.sin(month_angles),
        'Cos_month': np.cos(month_angles),
    })

    return training_dataframe.merge(month_lookup, on='Month', how='inner')
    

# Attach the Sin_month / Cos_month columns to the training data.
# NOTE: the helper merges on 'Month' only, so running this line again on a frame that
# already has these columns yields suffixed duplicates instead of replacing them.
training_df = add_cyclical_month_columns_to_training_df(training_df)

## Save Training Dataframe

In [15]:
# Write the finished training table to my data dir, then print it for inspection
training_df.to_csv(my_data_dir + f'RF_project/training_data/training_data.csv')
print(training_df)

     Year      Month Year_Month     Location  Latitude  Longitude  Drought  \
0    2014      March    2014-03        Cobar  -31.4980   145.8383        1   
1    2014      March    2014-03      Walgett  -30.0167   148.1167        1   
2    2014      March    2014-03    Coonamble  -30.9500   148.4000        1   
3    2014      March    2014-03       Warren  -31.7000   147.8330        1   
4    2014      March    2014-03       Nyngan  -31.5630   147.1962        1   
..    ...        ...        ...          ...       ...        ...      ...   
930  2020  September    2020-09     Bermagui  -36.4167   150.0667        0   
931  2020  September    2020-09         Bega  -36.6742   149.8429        0   
932  2020  September    2020-09       Casino  -28.8667   153.0500        0   
933  2020  September    2020-09      Bonalbo  -28.7372   152.6220        0   
934  2020  September    2020-09  Wagga Wagga  -35.1330   147.3670        0   

    Precipitation Acc_3-Month_Precipitation Acc_Annual_Precipit