# Imports

In [None]:
import re
import gc
import geopandas

import pandas as pd
import numpy as np
import bamboolib as bam
import netCDF4 as nc
import xarray as xr

import plotly.express as px
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.animation as anim 
import plotly.graph_objects as go 

from matplotlib import transforms
from cartopy import config
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from tqdm.auto import tqdm
from glob import glob
from json import load, dumps
from os import makedirs, path

In [None]:
# Let pandas print large frames without truncating rows/columns.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

from IPython.display import display, HTML

# Inject CSS that sets elements of class "container" to 85% width.
display(HTML("<style>.container { width:85% !important; }</style>"))

# Misc

Tahiti

- Actual coords: 17.6509° S, 149.4260° W
- Closest in the global simulation: -17.75    210.00

Darwin

- Actual coords: 12.4637° S, 130.8444° E
- Closest in the global simulation: -12.50    130.00

# Explore doing this with a single file

In [None]:
# Read one simulated year (Dec 1985 - Nov 1986) of model v000 into memory.
ds_global = xr.load_dataset(
    "/Users/tomm/projects/ALL-Global_MSLP/item16222_monthly_mean_v000_1985-12_1986-11.nc"
)

# Flatten the dataset into a long table with the coordinates as plain columns.
df_global = ds_global.to_dataframe().reset_index()

df_global

In [None]:
# Work on a copy of the global table with two text columns cut from the
# string form of the timestamp: first 4 characters -> "year",
# first 10 characters -> "date".
df_temp = df_global.assign(
    year=lambda frame: frame["time2"].astype(str).str[:4],
    date=lambda frame: frame["time2"].astype(str).str[:10],
)
# Optional filter to a single year:
# df_temp = df_temp.loc[df_temp["year"] == "1985"]
df_temp

In [None]:
# One animated scatter frame per date; each grid cell is coloured by its MSLP.
fig = px.scatter(
    data_frame=df_temp,
    x="longitude0",
    y="latitude0",
    color="item16222_monthly_mean",
    animation_frame="date",
    width=1000,
    height=700,
)

# Mark the two station boxes on the map.
for station_box in (
    dict(name="Tahiti", x0=209.5, x1=210.5, y0=-17.0, y1=-18.0),
    dict(name="Darwin", x0=129.5, x1=130.5, y0=-12.0, y1=-13.0),
):
    fig.add_shape(**station_box)

fig

In [None]:
# Keep only the two grid cells used as stand-ins for the stations:
# (lon 210, lat -17.5) for Tahiti and (lon 131.25, lat -12.5) for Darwin.
# .copy() makes the selection an independent frame, so the column writes
# below do not raise pandas' SettingWithCopyWarning.
df_important = df_global.loc[
    (
        (df_global["longitude0"]==210) & (df_global["latitude0"]==-17.5)
    ) | (
        (df_global["longitude0"]==131.25) & (df_global["latitude0"]==-12.5)
    )
].copy()

df_important.reset_index(drop=True, inplace=True)

# After the filter above, latitude alone tells the two stations apart.
df_important.loc[df_important["latitude0"] == -17.5, "location"] = "Tahiti"
df_important.loc[df_important["latitude0"] == -12.5, "location"] = "Darwin"

df_important

In [None]:
target_col = "item16222_monthly_mean"

# Independent copies per station: the "standardised" column added below (and
# further columns added in later cells) would otherwise be written into a
# slice of df_important and trigger SettingWithCopyWarning.
df_tahiti = df_important.loc[df_important["location"] == "Tahiti"].copy()
df_darwin = df_important.loc[df_important["location"] == "Darwin"].copy()

tahiti_mean = df_tahiti[target_col].mean()
tahiti_std = df_tahiti[target_col].std()

darwin_mean = df_darwin[target_col].mean()
darwin_std = df_darwin[target_col].std()

# z-score of each station's pressure against its own mean/std in this file.
df_tahiti["standardised"] = (df_tahiti[target_col] - tahiti_mean) / tahiti_std
df_darwin["standardised"] = (df_darwin[target_col] - darwin_mean) / darwin_std

In [None]:
# Parse the timestamps (via their string form) into pandas datetimes.
df_tahiti["time2"] = pd.to_datetime(
    df_tahiti["time2"].astype(str), format='%Y-%m-%d %H:%M:%S'
)

# Zero-padded month ("01".."12") and year as text, then a "YYYY-MM" label.
df_tahiti["month"] = df_tahiti["time2"].dt.month.astype(str).str.zfill(2)
df_tahiti["year"] = df_tahiti["time2"].dt.year.astype(str)
df_tahiti["year-month"] = df_tahiti["year"].str.cat(df_tahiti["month"], sep="-")

df_tahiti

In [None]:
# Standardised and raw pressure series of both stations as plain arrays.
darwin_values = df_darwin["standardised"].values
tahiti_values = df_tahiti["standardised"].values

darwin_raw = df_darwin[target_col].values
tahiti_raw = df_tahiti[target_col].values

# Only needed by the alternative formula noted below.
both_std = df_important[target_col].std()

# SOI per "YYYY-MM" label: standardised Tahiti minus standardised Darwin,
# paired by row position and rounded to one decimal.
# Alternative: round((tahiti_raw[i] - darwin_raw[i]) / both_std, 1)
results = {
    year_month: round(tahiti_values[position] - darwin_values[position], 1)
    for position, year_month in enumerate(df_tahiti["year-month"].values)
}

In [None]:
# Display the {"YYYY-MM": SOI} mapping for the single test file.
results

# Try doing it for all years of a single model

In [None]:
target_col = "item16222_monthly_mean"

# One 12-row frame per yearly file of model v000; concatenated after the loop.
dfs = []

for file_path in tqdm(sorted(glob("/Users/tomm/projects/ALL-Global_MSLP/item16222_monthly_mean_v000_*.nc"))):
    ds_global = xr.load_dataset(file_path)

    df_global = ds_global.to_dataframe()
    df_global = df_global.reset_index()

    # Keep the Tahiti (lon 210, lat -17.5) and Darwin (lon 131.25, lat -12.5)
    # grid cells. .copy() gives an independent frame so the column writes
    # below do not raise SettingWithCopyWarning.
    df_important = df_global.loc[
        (
            (df_global["longitude0"]==210) & (df_global["latitude0"]==-17.5)
        ) | (
            (df_global["longitude0"]==131.25) & (df_global["latitude0"]==-12.5)
        )
    ].copy()

    df_important.reset_index(drop=True, inplace=True)

    df_important.loc[df_important["latitude0"] == -17.5, "location"] = "Tahiti"
    df_important.loc[df_important["latitude0"] == -12.5, "location"] = "Darwin"

    # Timestamp -> datetime -> zero-padded month, year and "YYYY-MM" label.
    df_important["time2"] = df_important["time2"].astype(str)
    df_important['time2'] = pd.to_datetime(df_important['time2'], format='%Y-%m-%d %H:%M:%S')
    df_important["month"] = df_important["time2"].dt.month.astype(str).str.rjust(2, "0")
    df_important["year"] = df_important["time2"].dt.year.astype(str)

    df_important["year_month"] = df_important["year"] + "-" + df_important["month"]

    # One row per month with both stations side by side. The fixed 12-row
    # index assumes every file holds exactly twelve monthly steps.
    df = pd.DataFrame(
        data={
            "year": df_important.loc[df_important["location"] == "Tahiti"]["year"].values,
            "month": df_important["month"].unique(),
            "year_month": df_important["year_month"].unique(),
            "mslp_tahiti": df_important.loc[df_important["location"] == "Tahiti"][target_col].reset_index(drop=True),
            "mslp_darwin": df_important.loc[df_important["location"] == "Darwin"][target_col].reset_index(drop=True)
        },
        index=np.arange(0, 12)
    )

    # Standardise each station against its own mean/std within this file.
    tahiti_mean = df["mslp_tahiti"].mean()
    tahiti_std = df["mslp_tahiti"].std()

    darwin_mean = df["mslp_darwin"].mean()
    darwin_std = df["mslp_darwin"].std()

    df["standard_mslp_tahiti"] = ((df["mslp_tahiti"] - tahiti_mean) / tahiti_std).astype(float)
    df["standard_mslp_darwin"] = ((df["mslp_darwin"] - darwin_mean) / darwin_std).astype(float)

    df["soi"] = (df["standard_mslp_tahiti"] - df["standard_mslp_darwin"]).astype(float)

    # Rounds every numeric column (raw pressures included) to one decimal.
    df = df.round(1)

    dfs.append(df)

df = pd.concat(dfs)

In [None]:
# Display the concatenated per-file SOI table assembled by the loop above.
df

In [None]:
from os import makedirs

# Output folder for the single-model run, using key=value folder names.
parent_dir = "../wah_soi/sim=ALL/model_tag=v000"
makedirs(name=parent_dir, exist_ok=True)

df.to_parquet(path=f"{parent_dir}/data.parquet")

# Try doing it for all models all years

In [None]:
def generate_parent_folder(model_file, method_name, sim_type, string_to_replace):
    """Create the output folder for one model and list all of its input files.

    The model tag is the text between ``mean_`` and ``_<first four characters
    of string_to_replace>`` in ``model_file``. The output folder
    ``../wah_soi/method=<method_name>/sim=<sim_type>/model_tag=<tag>`` is
    created if it does not exist yet.

    Returns:
        ``(files, parent_dir)`` where ``files`` is the sorted list of paths
        matching ``model_file`` with ``string_to_replace`` swapped for ``*``.

    Raises:
        IndexError: if no model tag can be found in ``model_file``.
    """
    tag_pattern = "mean_(.*?)_" + string_to_replace[:4]
    tag_matches = re.findall(tag_pattern, model_file)
    model_tag = tag_matches[0]

    parent_dir = f"../wah_soi/method={method_name}/sim={sim_type}/model_tag={model_tag}"
    makedirs(parent_dir, exist_ok=True)

    # Same file name with the period wildcarded -> every period of this model.
    every_period = model_file.replace(string_to_replace, "*")
    return sorted(glob(every_period)), parent_dir


def read_and_extract(file_path):
    """Read one netCDF file and return Tahiti and Darwin MSLP side by side.

    The file is loaded with xarray, flattened to a table, and reduced to the
    grid cells (longitude0 210, latitude0 -17.5) for Tahiti and
    (longitude0 131.25, latitude0 -12.5) for Darwin.

    Changes from the previous implementation:
    * each station is taken as an explicit copy, so no column is written into
      a filtered slice (no SettingWithCopyWarning);
    * the two stations are paired on their timestamp instead of on row
      position, and year/month come from the same rows as the pressures.

    Args:
        file_path: path of the netCDF file to read.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns ``year`` (text),
        ``month`` (zero-padded text), ``year_month`` ("YYYY-MM"),
        ``mslp_tahiti`` and ``mslp_darwin``; one row per Tahiti time step.
    """
    TARGET_COL = "item16222_monthly_mean"

    ds_global = xr.load_dataset(file_path)
    df_global = ds_global.to_dataframe().reset_index()

    def station_rows(longitude, latitude, value_name):
        """Return the time/value rows of one grid cell, value column renamed."""
        at_station = (df_global["longitude0"] == longitude) & (df_global["latitude0"] == latitude)
        rows = df_global.loc[at_station, ["time2", TARGET_COL]].copy()
        # Same conversion route as before: string form first, then a strict parse.
        rows["time2"] = pd.to_datetime(rows["time2"].astype(str), format='%Y-%m-%d %H:%M:%S')
        return rows.rename(columns={TARGET_COL: value_name})

    tahiti = station_rows(210, -17.5, "mslp_tahiti")
    darwin = station_rows(131.25, -12.5, "mslp_darwin")

    # Left merge keeps every Tahiti time step (in file order); a time step
    # missing at Darwin yields NaN rather than a silently shifted pairing.
    df = tahiti.merge(darwin, on="time2", how="left")

    df["year"] = df["time2"].dt.year.astype(str)
    df["month"] = df["time2"].dt.month.astype(str).str.rjust(2, "0")
    df["year_month"] = df["year"] + "-" + df["month"]

    return df[["year", "month", "year_month", "mslp_tahiti", "mslp_darwin"]].reset_index(drop=True)


def generate_soi_for_model_by_rolling(model_file, sim_type, string_to_replace, window_size):
    """Write the SOI of one model, standardised against a trailing window.

    Each station's pressure is standardised with the mean and standard
    deviation of the last ``window_size`` rows (current row included) of the
    concatenated series. The result goes to ``data.parquet`` in the folder for
    method ``roll_<window_size, zero-padded to 2 digits>``. Nothing is
    computed when that file already exists.

    Returns:
        The sorted list of input files belonging to the model.
    """
    method_name = f"roll_{str(window_size).rjust(2, '0')}"
    files, parent_dir = generate_parent_folder(model_file, method_name, sim_type, string_to_replace)

    output_file = f"{parent_dir}/data.parquet"
    if path.exists(output_file):
        return files

    df = pd.concat([read_and_extract(nc_path) for nc_path in files])

    # Trailing-window mean and standard deviation for each station.
    for raw_col, mean_col, std_col in (
        ("mslp_tahiti", "mslp_tahiti_mean", "mslp_tahiti_std"),
        ("mslp_darwin", "mslp_darwin_mean", "mslp_darwin_std"),
    ):
        trailing = df[raw_col].rolling(window_size)
        df[mean_col] = trailing.mean()
        df[std_col] = trailing.std()

    tahiti_z = (df["mslp_tahiti"] - df["mslp_tahiti_mean"]) / df["mslp_tahiti_std"]
    darwin_z = (df["mslp_darwin"] - df["mslp_darwin_mean"]) / df["mslp_darwin_std"]
    df["standardised_mslp_tahiti"] = tahiti_z.astype(float)
    df["standardised_mslp_darwin"] = darwin_z.astype(float)

    df["soi"] = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"]).astype(float)

    # Every numeric column is stored with one decimal.
    df = df.round(1).reset_index(drop=True)
    df.to_parquet(output_file)

    return files


def generate_soi_for_model_by_similar_months(model_file, sim_type, string_to_replace):
    """Write the SOI of one model, standardised per calendar month.

    Each station's pressure is standardised with the mean and standard
    deviation of all rows of the same calendar month in the model's
    concatenated series. The result goes to ``data.parquet`` in the folder
    for method ``month``. Nothing is computed when that file already exists.

    Returns:
        The sorted list of input files belonging to the model.
    """
    files, parent_dir = generate_parent_folder(model_file, "month", sim_type, string_to_replace)

    output_file = f"{parent_dir}/data.parquet"
    if path.exists(output_file):
        return files

    df = pd.concat([read_and_extract(nc_path) for nc_path in files])

    # Rows of each calendar month, taken once and reused for both stations.
    rows_by_month = {
        month: df.loc[df["month"] == month] for month in df["month"].unique()
    }

    for raw_col, mean_col, std_col in (
        ("mslp_tahiti", "mslp_tahiti_mean", "mslp_tahiti_std"),
        ("mslp_darwin", "mslp_darwin_mean", "mslp_darwin_std"),
    ):
        month_means = {month: rows[raw_col].mean() for month, rows in rows_by_month.items()}
        month_stds = {month: rows[raw_col].std() for month, rows in rows_by_month.items()}
        df[mean_col] = df["month"].map(month_means)
        df[std_col] = df["month"].map(month_stds)

    tahiti_z = (df["mslp_tahiti"] - df["mslp_tahiti_mean"]) / df["mslp_tahiti_std"]
    darwin_z = (df["mslp_darwin"] - df["mslp_darwin_mean"]) / df["mslp_darwin_std"]
    df["standardised_mslp_tahiti"] = tahiti_z.astype(float)
    df["standardised_mslp_darwin"] = darwin_z.astype(float)

    df["soi"] = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"]).astype(float)

    # Every numeric column is stored with one decimal.
    df = df.round(1).reset_index(drop=True)
    df.to_parquet(output_file)

    return files


def generate_soi_for_model_by_reference(model_file, method_name, sim_type, string_to_replace, reference):
    """Write the SOI of one model, standardised against fixed reference stats.

    ``reference`` must provide the keys ``tahiti_mean``, ``tahiti_std``,
    ``darwin_mean`` and ``darwin_std``. The result goes to ``data.parquet`` in
    the folder for ``method_name``. Nothing is computed when that file
    already exists.

    Returns:
        The sorted list of input files belonging to the model.
    """
    files, parent_dir = generate_parent_folder(model_file, method_name, sim_type, string_to_replace)

    output_file = f"{parent_dir}/data.parquet"
    if path.exists(output_file):
        return files

    df = pd.concat([read_and_extract(nc_path) for nc_path in files])

    tahiti_z = (df["mslp_tahiti"] - reference["tahiti_mean"]) / reference["tahiti_std"]
    darwin_z = (df["mslp_darwin"] - reference["darwin_mean"]) / reference["darwin_std"]
    df["standardised_mslp_tahiti"] = tahiti_z.astype(float)
    df["standardised_mslp_darwin"] = darwin_z.astype(float)

    df["soi"] = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"]).astype(float)

    # Every numeric column is stored with one decimal.
    df = df.round(1).reset_index(drop=True)
    df.to_parquet(output_file)

    return files

## Generate Reference dataframes

In [None]:
# Observed (CRU) station pressures restricted to the years 1984-2014.
# .copy() makes the filtered rows an independent frame, so the in-place
# scaling below does not raise SettingWithCopyWarning.
df_real_world_reference = pd.read_csv("../Data/MSLP_CRU.csv")
df_real_world_reference = df_real_world_reference.loc[
    df_real_world_reference["year"].between(1984, 2014)
].copy()

# Weather at home uses slightly different units, adjust CRU accordingly
df_real_world_reference["mslp_tahiti"] *= 10
df_real_world_reference["mslp_darwin"] *= 10

# Mean/std per station, in the shape expected by
# generate_soi_for_model_by_reference.
reference_real_world = {
    "tahiti_mean": df_real_world_reference["mslp_tahiti"].mean(),
    "tahiti_std": df_real_world_reference["mslp_tahiti"].std(),
    "darwin_mean": df_real_world_reference["mslp_darwin"].mean(),
    "darwin_std": df_real_world_reference["mslp_darwin"].std()
}

In [None]:
# Every file of every model in the ALL ensemble.
files_ALL = sorted(glob("/Users/tomm/projects/ALL-Global_MSLP/item16222_monthly_mean_*_*.nc"))

dfs = [read_and_extract(nc_path) for nc_path in tqdm(files_ALL)]
df_ALL_world_reference = pd.concat(dfs)

# Mean/std per station over the whole ALL ensemble.
reference_ALL_world = dict(
    tahiti_mean=df_ALL_world_reference["mslp_tahiti"].mean(),
    tahiti_std=df_ALL_world_reference["mslp_tahiti"].std(),
    darwin_mean=df_ALL_world_reference["mslp_darwin"].mean(),
    darwin_std=df_ALL_world_reference["mslp_darwin"].std(),
)

In [None]:
# Every file of every model in the NAT ensemble.
files_NAT = sorted(glob("/Users/tomm/projects/NAT-Global_MSLP/item16222_monthly_mean_*_*.nc"))

dfs = [read_and_extract(nc_path) for nc_path in tqdm(files_NAT)]
df_NAT_world_reference = pd.concat(dfs)

# Mean/std per station over the whole NAT ensemble.
reference_NAT_world = dict(
    tahiti_mean=df_NAT_world_reference["mslp_tahiti"].mean(),
    tahiti_std=df_NAT_world_reference["mslp_tahiti"].std(),
    darwin_mean=df_NAT_world_reference["mslp_darwin"].mean(),
    darwin_std=df_NAT_world_reference["mslp_darwin"].std(),
)

In [None]:
# Drop the references to the large intermediate frames so they can be freed;
# only the small reference_* dictionaries are needed from here on.
dfs = df_real_world_reference = df_ALL_world_reference = df_NAT_world_reference = None

## Generate for everything - ALL

In [None]:
# Input files touched while generating every SOI variant for the ALL runs.
files = []

string_to_replace = "1988-12_1989-11"
sim_type = "ALL"

# One seed file per model: the 1988-12_1989-11 period identifies the model,
# and each generator expands it to all periods of that model.
for model_file in tqdm(sorted(glob(f"/Users/tomm/projects/{sim_type}-Global_MSLP/item16222_monthly_mean_*_{string_to_replace}.nc")), leave=True, desc="Model Loop"):
    for rolling_window in (12, 6, 4):
        files.extend(generate_soi_for_model_by_rolling(model_file, sim_type, string_to_replace, rolling_window))
    files.extend(generate_soi_for_model_by_similar_months(model_file, sim_type, string_to_replace))
    files.extend(generate_soi_for_model_by_reference(model_file, "ref_model_world", sim_type, string_to_replace, reference_ALL_world))
    files.extend(generate_soi_for_model_by_reference(model_file, "ref_real_world", sim_type, string_to_replace, reference_real_world))

# First count: distinct files processed; second count: files on disk.
print(f"Found {len(set(files)):,} files")

real_files = sorted(glob(f"/Users/tomm/projects/{sim_type}-Global_MSLP/item*.nc"))
print(f"Found {len(real_files):,} files")

In [None]:
# Files on disk that no generator call returned.
missing = sorted(set(real_files) - set(files))

# Keep the first missing file per key, where the key is characters 60-63 of
# the path. NOTE(review): presumably the 4-character model tag; the offsets
# only fit the absolute path layout used in this notebook - confirm before
# moving the data.
missing_dict = {}
for item in missing:
    missing_dict.setdefault(item[60:64], item)

missing_dict

In [None]:
## It always manages to miss some so just run those manually

for file in tqdm(missing_dict.values()):
    # Characters 65-79 of the path are used as the period to wildcard.
    # NOTE(review): fixed offsets - only valid for this notebook's path layout.
    string_to_replace = file[65:80]
    for rolling_window in (12, 6, 4):
        generate_soi_for_model_by_rolling(file, sim_type, string_to_replace, rolling_window)
    generate_soi_for_model_by_similar_months(file, sim_type, string_to_replace)
    for method_name, reference in (
        ("ref_model_world", reference_ALL_world),
        ("ref_real_world", reference_real_world),
    ):
        generate_soi_for_model_by_reference(file, method_name, sim_type, string_to_replace, reference)

## Generate for everything - NAT

In [None]:
# Input files touched while generating every SOI variant for the NAT runs.
files = []

string_to_replace = "1988-12_1989-11"
sim_type = "NAT"

# One seed file per model: the 1988-12_1989-11 period identifies the model,
# and each generator expands it to all periods of that model.
for model_file in tqdm(sorted(glob(f"/Users/tomm/projects/{sim_type}-Global_MSLP/item16222_monthly_mean_*_{string_to_replace}.nc")), leave=True, desc="Model Loop"):
    for rolling_window in (12, 6, 4):
        files.extend(generate_soi_for_model_by_rolling(model_file, sim_type, string_to_replace, rolling_window))
    files.extend(generate_soi_for_model_by_similar_months(model_file, sim_type, string_to_replace))
    files.extend(generate_soi_for_model_by_reference(model_file, "ref_model_world", sim_type, string_to_replace, reference_NAT_world))
    files.extend(generate_soi_for_model_by_reference(model_file, "ref_real_world", sim_type, string_to_replace, reference_real_world))

# First count: distinct files processed; second count: files on disk.
print(f"Found {len(set(files)):,} files")

real_files = sorted(glob(f"/Users/tomm/projects/{sim_type}-Global_MSLP/item*.nc"))
print(f"Found {len(real_files):,} files")

In [None]:
# Files on disk that no generator call returned.
missing = sorted(set(real_files) - set(files))

# Keep the first missing file per key, where the key is characters 60-63 of
# the path. NOTE(review): presumably the 4-character model tag; the offsets
# only fit the absolute path layout used in this notebook - confirm before
# moving the data.
missing_dict = {}
for item in missing:
    missing_dict.setdefault(item[60:64], item)

missing_dict

In [None]:
# Re-run every SOI variant for the models the main NAT loop did not cover.
for file in tqdm(missing_dict.values()):
    # Characters 65-79 of the path are used as the period to wildcard.
    # NOTE(review): fixed offsets - only valid for this notebook's path layout.
    string_to_replace = file[65:80]
    for rolling_window in (12, 6, 4):
        generate_soi_for_model_by_rolling(file, sim_type, string_to_replace, rolling_window)
    generate_soi_for_model_by_similar_months(file, sim_type, string_to_replace)
    for method_name, reference in (
        ("ref_model_world", reference_NAT_world),
        ("ref_real_world", reference_real_world),
    ):
        generate_soi_for_model_by_reference(file, method_name, sim_type, string_to_replace, reference)

# Compare the different methods to each other

In [None]:
def group_by_year_month(df):
    """Average ``soi`` over all rows sharing the same ``year_month`` and ``sim``.

    Returns:
        A frame with the columns ``year_month``, ``sim`` and ``soi`` (mean,
        rounded to one decimal), one row per group.
    """
    keys = ["year_month", "sim"]

    averaged = df.groupby(keys)[["soi"]].mean().reset_index()
    averaged["soi"] = averaged["soi"].astype(float).round(1)

    return averaged

In [None]:
# Load every stored SOI variant (all sims and models below each method folder).
df_roll_12 = pd.read_parquet("../wah_soi/method=roll_12/")
df_roll_06 = pd.read_parquet("../wah_soi/method=roll_06/")
df_roll_04 = pd.read_parquet("../wah_soi/method=roll_04/")
df_month = pd.read_parquet("../wah_soi/method=month/")
df_ref_model = pd.read_parquet("../wah_soi/method=ref_model_world/")
df_ref_real = pd.read_parquet("../wah_soi/method=ref_real_world/")

# Constant offset applied to the real-world-referenced SOI.
# NOTE(review): the origin of 4.89 is not shown in this notebook - confirm.
df_ref_real["soi"] = df_ref_real["soi"] + 4.89

# Mean SOI per (year_month, sim) for each variant.
(
    df_grouped_roll_12,
    df_grouped_roll_06,
    df_grouped_roll_04,
    df_grouped_month,
    df_grouped_model,
    df_grouped_real,
) = map(
    group_by_year_month,
    (df_roll_12, df_roll_06, df_roll_04, df_month, df_ref_model, df_ref_real),
)

In [None]:
# Mean SOI per month for the "month" method, ALL and NAT bars overlaid.
px.bar(
    data_frame=df_grouped_month.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

In [None]:
# Mean SOI per month for the 12-row rolling method, ALL and NAT bars overlaid.
px.bar(
    data_frame=df_grouped_roll_12.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

In [None]:
# Mean SOI per month for the 6-row rolling method, ALL and NAT bars overlaid.
px.bar(
    data_frame=df_grouped_roll_06.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

In [None]:
# Mean SOI per month for the 4-row rolling method, ALL and NAT bars overlaid.
px.bar(
    data_frame=df_grouped_roll_04.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

In [None]:
# Mean SOI per month for the model-world reference method, ALL and NAT overlaid.
px.bar(
    data_frame=df_grouped_model.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

In [None]:
# Mean SOI per month for the real-world reference method, ALL and NAT overlaid.
px.bar(
    data_frame=df_grouped_real.dropna(subset=["soi"]),
    x="year_month",
    y="soi",
    color="sim",
    barmode="overlay",
)

### Is the world becoming "more Nino" as time goes on?

In [None]:
# Number of consecutive months averaged for each plotted point.
window_size = 12

df_model_ALL = df_grouped_model.loc[df_grouped_model["sim"] == "ALL"]
df_model_NAT = df_grouped_model.loc[df_grouped_model["sim"] == "NAT"]

# Trailing mean of the monthly SOI; the first window_size - 1 values are NaN.
values_ALL = df_model_ALL["soi"].rolling(window_size).mean().round(1)
values_NAT = df_model_NAT["soi"].rolling(window_size).mean().round(1)

fig = go.Figure()

# Each series is plotted against the year_month labels of its own rows.
# The previous version sliced a shared label list by window_size while the
# y-values kept their full length (NaNs included), so every average was drawn
# window_size months later than the month that ends its window.
fig.add_trace(
    go.Scatter(
        x=df_model_ALL["year_month"],
        y=values_ALL,
        name="ALL",
    )
)

fig.add_trace(
    go.Scatter(
        x=df_model_NAT["year_month"],
        y=values_NAT,
        name="NAT",
    )
)

fig.update_layout(
    hovermode="x",
    height=500
)

# Sanity check your calculations on CRU observed MSLP data

## Build a nicer CSV from their data

In [None]:
## CRU

# Raw files are wide: one row per year, one column per month ("01".."12").
df_tahiti = pd.read_csv("../Data/raw_CRU_MSLP_tahiti.csv", delimiter=" ")
df_darwin = pd.read_csv("../Data/raw_CRU_MSLP_darwin.csv", delimiter=" ")

month_labels = [str(month).rjust(2, '0') for month in range(1, 13)]
mslp_years = list(df_tahiti["year"].unique())

# Long layout, built month by month: all years of "01", then all years of "02", ...
df = pd.DataFrame(data={
    "year": mslp_years * 12,
    "month": [label for label in month_labels for _ in mslp_years],
    "mslp_tahiti": [value for label in month_labels for value in df_tahiti[label].values],
    "mslp_darwin": [value for label in month_labels for value in df_darwin[label].values],
})

# Order chronologically via a temporary "YYYY_MM" key, then drop the key.
df = (
    df.assign(year_month=df["year"].astype(str) + "_" + df["month"].astype(str))
    .sort_values("year_month")
    .drop(columns=["year_month"])
    .reset_index(drop=True)
)
df.to_csv("../Data/MSLP_CRU.csv", index=False)


# Same reshape for the published CRU SOI values.
df_soi = pd.read_csv("../Data/raw_CRU_SOI.csv", delimiter=" ")
soi_years = list(df_soi["year"].unique())

df = pd.DataFrame(data={
    "year": soi_years * 12,
    "month": [label for label in month_labels for _ in soi_years],
    "soi": [value for label in month_labels for value in df_soi[label].values],
})

df = (
    df.assign(year_month=df["year"].astype(str) + "_" + df["month"].astype(str))
    .sort_values("year_month")
    .drop(columns=["year_month"])
    .reset_index(drop=True)
)
df.to_csv("../Data/SOI_CRU.csv", index=False)

In [None]:
## NOAA

# Raw files are wide: one row per year, one column per month ("01".."12").
df_anomaly = pd.read_csv("../Data/raw_NOAA_MSLP_anomaly.csv", delimiter=" ")
df_standardised = pd.read_csv("../Data/raw_NOAA_MSLP_standardised.csv", delimiter=" ")

month_labels = [str(month).rjust(2, '0') for month in range(1, 13)]
noaa_years = list(df_anomaly["year"].unique())

# Long layout, built month by month: all years of "01", then all years of "02", ...
df = pd.DataFrame(data={
    "year": noaa_years * 12,
    "month": [label for label in month_labels for _ in noaa_years],
    "mslp_anomaly": [value for label in month_labels for value in df_anomaly[label].values],
    "mslp_standardised": [value for label in month_labels for value in df_standardised[label].values],
})

# The standardised series is used as the SOI as-is.
# Alternative tried: (df["mslp_anomaly"] - df["mslp_standardised"]).astype(float).round(1)
df["soi"] = df["mslp_standardised"].astype(float).round(1)

# Order chronologically via a temporary "YYYY_MM" key, then drop the key.
df = (
    df.assign(year_month=df["year"].astype(str) + "_" + df["month"].astype(str))
    .sort_values("year_month")
    .drop(columns=["year_month"])
    .reset_index(drop=True)
)

df.to_csv("../Data/SOI_NOAA.csv", index=False)

## Compute

In [None]:
def generate_soi_by_similar_month(df, df_reference):
    """Return the SOI of ``df`` standardised per calendar month.

    For every month value present in ``df``, the mean and standard deviation
    of each station are taken from the ``df_reference`` rows whose ``month``
    equals that value. ``df`` is modified in place: the per-month statistics
    and the standardised pressures are added as columns.

    Returns:
        Series of standardised Tahiti minus standardised Darwin, rounded to
        one decimal.
    """
    columns = (
        ("mslp_tahiti", "mslp_tahiti_mean", "mslp_tahiti_std", "standardised_mslp_tahiti"),
        ("mslp_darwin", "mslp_darwin_mean", "mslp_darwin_std", "standardised_mslp_darwin"),
    )

    # Reference rows of each month occurring in df.
    reference_by_month = {
        month: df_reference.loc[df_reference["month"] == month]
        for month in df["month"].unique()
    }

    for raw_col, mean_col, std_col, _ in columns:
        df[mean_col] = df["month"].map(
            {month: rows[raw_col].mean() for month, rows in reference_by_month.items()}
        )
        df[std_col] = df["month"].map(
            {month: rows[raw_col].std() for month, rows in reference_by_month.items()}
        )

    for raw_col, mean_col, std_col, z_col in columns:
        df[z_col] = ((df[raw_col] - df[mean_col]) / df[std_col]).astype(float)

    soi = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"])
    return soi.astype(float).round(1)


def generate_soi_by_everything(df, df_reference):
    """Return the SOI of ``df`` standardised against the whole reference.

    One mean and one standard deviation per station are taken over all rows
    of ``df_reference``. ``df`` is modified in place: the standardised
    pressures are added as columns.

    Returns:
        Series of standardised Tahiti minus standardised Darwin, rounded to
        one decimal.
    """
    tahiti_reference = df_reference["mslp_tahiti"]
    darwin_reference = df_reference["mslp_darwin"]

    tahiti_z = (df["mslp_tahiti"] - tahiti_reference.mean()) / tahiti_reference.std()
    darwin_z = (df["mslp_darwin"] - darwin_reference.mean()) / darwin_reference.std()

    df["standardised_mslp_tahiti"] = tahiti_z.astype(float)
    df["standardised_mslp_darwin"] = darwin_z.astype(float)

    soi = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"])
    return soi.astype(float).round(1)


def generate_soi_by_rolling(df, df_reference, window_size):
    """Return the SOI of ``df`` standardised against a trailing window.

    Each station's pressure is standardised with the mean and standard
    deviation of the last ``window_size`` rows of ``df`` (current row
    included). ``df_reference`` is accepted but not used. ``df`` is modified
    in place: the rolling statistics and the standardised pressures are added
    as columns.

    Returns:
        Series of standardised Tahiti minus standardised Darwin, rounded to
        one decimal; NaN until a full window is available.
    """
    columns = (
        ("mslp_tahiti", "mslp_tahiti_mean", "mslp_tahiti_std", "standardised_mslp_tahiti"),
        ("mslp_darwin", "mslp_darwin_mean", "mslp_darwin_std", "standardised_mslp_darwin"),
    )

    for raw_col, mean_col, std_col, _ in columns:
        trailing = df[raw_col].rolling(window_size)
        df[mean_col] = trailing.mean()
        df[std_col] = trailing.std()

    for raw_col, mean_col, std_col, z_col in columns:
        df[z_col] = ((df[raw_col] - df[mean_col]) / df[std_col]).astype(float)

    soi = df["standardised_mslp_tahiti"].sub(df["standardised_mslp_darwin"])
    return soi.astype(float).round(1)



def generate_soi_by_difference(df, df_reference):
    """Return the SOI as the standardised Tahiti-minus-Darwin pressure difference.

    The raw difference is stored in ``df["difference"]`` (``df`` is modified
    in place) and standardised with the mean and standard deviation of
    ``df_reference["difference"]``.

    Returns:
        Series of the standardised difference, rounded to one decimal.
    """
    df["difference"] = df["mslp_tahiti"].sub(df["mslp_darwin"])

    baseline = df_reference["difference"]
    standardised = (df["difference"] - baseline.mean()) / baseline.std()

    return standardised.astype(float).round(1)

In [None]:
df_SOI_CRU = pd.read_csv("../Data/SOI_CRU.csv")
df_SOI_NOAA = pd.read_csv("../Data/SOI_NOAA.csv")
df_MSLP = pd.read_csv("../Data/MSLP_CRU.csv")

# Observed pressures next to the published CRU SOI; the columns are combined
# by row index.
df = pd.DataFrame(
    data={
        "year": df_SOI_CRU["year"],
        "month": df_SOI_CRU["month"],
        "mslp_tahiti": df_MSLP["mslp_tahiti"],
        "mslp_darwin": df_MSLP["mslp_darwin"],
        "soi_CRU": df_SOI_CRU["soi"],
    }
)

# Reference period: the years 1980 to 2009 inclusive.
df_reference = df.loc[df["year"].isin(range(1980, 2010))].copy()
df_reference["difference"] = df_reference["mslp_tahiti"].sub(df_reference["mslp_darwin"])

# Evaluation period: 1966 to 2019 inclusive, re-indexed from 0.
df = df.loc[df["year"].between(1966, 2019)].reset_index(drop=True)
df["year"] = df["year"].astype(str)
df["month"] = df["month"].astype(str).str.rjust(2, '0')
df["soi_CRU"] = df["soi_CRU"].astype(float).round(1)
# NOTE(review): assigned by index label, so row i of SOI_NOAA.csv lands on
# row i of the re-indexed 1966-2019 frame - confirm SOI_NOAA.csv starts at
# 1966-01.
df["soi_NOAA"] = df_SOI_NOAA["soi"].astype(float).round(1)


# Not run at the moment:
# df["soi_reference"] = generate_soi_by_reference(df.copy(), df_reference.copy())
# df["soi_month"] = generate_soi_by_similar_month(df.copy(), df_reference.copy())
for column_name, rolling_window in (
    ("soi_roll_012", 12),
    ("soi_roll_024", 24),
    ("soi_roll_048", 48),
    ("soi_roll_120", 120),
):
    df[column_name] = generate_soi_by_rolling(df.copy(), df_reference.copy(), rolling_window)
df["soi_difference"] = generate_soi_by_difference(df.copy(), df_reference.copy())

df

In [None]:
from sklearn.metrics import mean_squared_error

# Compare only rows where every SOI variant is available.
df = df.dropna(axis=0)


def rmse(series_a, series_b):
    """Return the root-mean-square error between two equally long series.

    The printed labels say "rmse", but the values used to be the plain
    mean squared error; the square root makes the numbers match the labels.
    """
    return float(np.sqrt(mean_squared_error(series_a, series_b)))


print(f'rmse_sanity     {rmse(df["soi_CRU"], df["soi_CRU"]):.2f}')
print(f'rmse_cru_noaa   {rmse(df["soi_CRU"], df["soi_NOAA"]):.2f}')

# Not computed at the moment (the columns are not generated above):
# print(f'rmse_reference {rmse(df["soi_reference"], df["soi_CRU"]):.2f}')
# print(f'rmse_reference {rmse(df["soi_reference"], df["soi_NOAA"]):.2f}')

# print(f'rmse_month      {rmse(df["soi_month"], df["soi_CRU"]):.2f}')

print(f'rmse_roll_012   {rmse(df["soi_roll_012"], df["soi_CRU"]):.2f}')
print(f'rmse_roll_024   {rmse(df["soi_roll_024"], df["soi_CRU"]):.2f}')
print(f'rmse_roll_048   {rmse(df["soi_roll_048"], df["soi_CRU"]):.2f}')
print(f'rmse_roll_120   {rmse(df["soi_roll_120"], df["soi_CRU"]):.2f}')

# First line: against the CRU SOI; second line: against the NOAA SOI.
print(f'rmse_difference  {rmse(df["soi_difference"], df["soi_CRU"]):.2f}')
print(f'rmse_difference  {rmse(df["soi_difference"], df["soi_NOAA"]):.2f}')

In [None]:
# Display the 1980-2009 reference rows, including their "difference" column.
df_reference

# Try applying SOI to existing processed data

In [None]:
# Root of the external drive holding the processed Weather@Home data.
COMMON_DIR = "/Volumes/T7/ExtremeWeather"
DATA_DIR_SIM = f"{COMMON_DIR}/Data_WeatherAtHome/climatology_1986-2014"

# Every processed parquet file for the year 2010, across all regions,
# sims and model tags.
files = sorted(
    glob(
        f"{DATA_DIR_SIM}/Processed/year=2010/region=*/sim=*/model_tag=*/data.parquet"
    )
)

In [None]:
file_path = files[0]

# Read the partition values out of the ".../key=value/..." path segments.
year, region, sim_type, model_tag = (
    re.findall(pattern, file_path)[0]
    for pattern in (
        r"year=(.*?)/",
        r"region=(.*?)/",
        r"sim=(.*?)/",
        r"model_tag=(.*?)/",
    )
)

# Processed data for that file, plus the stored SOI of the same sim and model.
df = pd.read_parquet(file_path)
df_soi = pd.read_parquet(
    f"../wah_soi/method=ref_model_world/sim={sim_type}/model_tag={model_tag}/"
)

In [None]:
group_cols = ["year", "month"]

# Drop the existing "soi" column and attach the stored per-model SOI instead,
# matched on year and month; rows without a match get NaN.
df = df.drop(columns=["soi"])

df = pd.merge(
    left=df,
    right=df_soi[group_cols + ["soi"]],
    how="left",
    on=group_cols,
)

In [None]:
# Display the processed frame after its "soi" column was replaced by the merge.
df

# Placeholder

In [None]:
# Monthly processed Weather@Home table, read from the parent folder.
df_wah = pd.read_parquet(path="../processed_monthly_WaH.parquet")

In [None]:
# Display the monthly Weather@Home table that was just loaded.
df_wah

In [None]:
# Published CRU SOI restricted to the years 1986-2014 inclusive.
df_cru = pd.read_csv("../Data/SOI_CRU.csv")
df_cru = df_cru.loc[df_cru["year"].between(1986.0, 2014.0)].reset_index(drop=True)

# Integer year, zero-padded month text, SOI rounded to one decimal.
df_cru = df_cru.assign(
    year=df_cru["year"].astype(int),
    month=df_cru["month"].astype(int).astype(str).str.zfill(2),
    soi=df_cru["soi"].astype(float).round(1),
)
df_cru

In [None]:
# Attach the observed SOI to every model row of the same year and month.
# Columns present in both frames get the suffixes "_model" and "_real".
df_merged = pd.merge(
    left=df_wah,
    right=df_cru,
    on=["year", "month"],
    how="left",
    suffixes=("_model", "_real"),
)

# "YYYY_MM" label for each row.
df_merged["year_month"] = df_merged["year"].astype(str).str.cat(
    df_merged["month"].astype(str), sep="_"
)
df_merged

In [None]:
# Store the merged model/observation table next to the original file.
df_merged.to_parquet(path="../processed_monthly_WaH_test.parquet")

In [None]:
# NOTE(review): df_subset is not assigned anywhere in the visible notebook;
# presumably a filtered view of df_merged from a removed cell - confirm.
df_subset

In [None]:
x_values = df_subset["year_month"].unique()

# Split the subset by simulation type.
df_sub_ALL = df_subset.loc[df_subset["sim"] == "ALL"]
df_sub_NAT = df_subset.loc[df_subset["sim"] == "NAT"]

fig = go.Figure()

# One marker series each: model SOI for ALL (blue) and NAT (red), and the
# observed SOI (green) taken from the ALL rows.
for frame, value_col, rgba, label in (
    (df_sub_ALL, "soi_model", "rgba(0, 0, 255, 0.5)", "ALL model SOI"),
    (df_sub_NAT, "soi_model", "rgba(255, 0, 0, 0.5)", "NAT model SOI"),
    (df_sub_ALL, "soi_real", "rgba(0, 200, 0, 1.0)", "Real world SOI"),
):
    fig.add_trace(
        go.Scatter(
            x=frame["year_month"],
            y=frame[value_col],
            marker={"color": rgba},
            mode="markers",
            name=label,
        )
    )

fig.update_layout(height=600)
fig