In [None]:
import yaml

import numpy as np
import pandas as pd

from Toy_ML import training, inference, poly_fit


In [4]:
# Parse the notebook settings stored in config.yaml into CONFIG.
with open("config.yaml") as config_file:
    CONFIG = yaml.safe_load(config_file)

## 1. Pre-processing

### 1.1. Load the GLODAP data

In [2]:
# Header names of the CSV columns referenced in this notebook.

# Geographic position of each sample
longitude_field = "Longitude [degrees East]"
latitude_field = "Latitude [degrees North]"

# Measured quantities
salinity_field = "SALNTY [PSS-78]"
temperature_field = "TEMPERATURE [DEG C]"
oxygen_iso_field = "O18/O16 [/MILLE]"

# Sample identifier
reference_field = "Sample ID:INTEGER"

# NOTE: despite its name, this is the header of the timestamp column (it is
# used as a column key when the timestamps are parsed), not a datetime
# format string.
dt_format = "yyyy-mm-ddThh:mm:ss.sss"

In [11]:
# Columns that must hold a value for a row to be kept.
required_columns = [
    longitude_field,
    latitude_field,
    salinity_field,
    oxygen_iso_field,
    temperature_field,
]

# --- Load CSV and clean data ---
# 1. read the file whose path is stored under the "glodap" config key,
# 2. turn the '**' placeholder into NaN,
# 3. discard every row with a missing value in one of the required columns.
df = (
    pd.read_csv(CONFIG["glodap"])
    .replace("**", np.nan)
    .dropna(subset=required_columns)
)


In [12]:
# Convert every measurement column to a numeric dtype.
# The temperature column is included as well: it is part of the NaN filter
# applied when the data is cleaned, so it needs the same conversion as the
# other fields. pd.to_numeric raises on any value that cannot be parsed,
# which surfaces leftover placeholders instead of silently keeping strings.
numeric_columns = [
    longitude_field,
    latitude_field,
    salinity_field,
    oxygen_iso_field,
    temperature_field,
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column])

# Parse the timestamp column (dt_format holds its header name) into a proper
# "datetime" column and drop the raw text column.
# The guard makes the cell safe to re-run: once "datetime" exists the raw
# column is already gone, and indexing it again would raise a KeyError.
if "datetime" not in df.columns:
    df["datetime"] = pd.to_datetime(df[dt_format])
    df = df.drop(columns=dt_format)


In [13]:
# Show the cleaned table as the cell output.
df

Unnamed: 0,SALNTY [PSS-78],O18/O16 [/MILLE],TEMPERATURE [DEG C],Station,Sample ID:INTEGER,Longitude [degrees East],Latitude [degrees North],DEPTH [M],datetime
0,34.098999,0.250,-1.65760,33,1,31.4030,81.27200,3.0,1987-07-07 17:29:59
1,34.102001,0.110,-1.66360,33,2,31.4030,81.27200,9.0,1987-07-07 17:29:59
2,34.151001,0.240,-1.64260,33,3,31.4030,81.27200,20.0,1987-07-07 17:29:59
3,34.453999,0.140,-1.13470,33,4,31.4030,81.27200,40.0,1987-07-07 17:29:59
4,34.675999,0.270,-0.45190,33,6,31.4030,81.27200,59.0,1987-07-07 17:29:59
...,...,...,...,...,...,...,...,...,...
6905,34.816898,0.133,1.63920,10285,60,8.7277,82.45316,148.0,2021-09-11 02:43:00
6906,34.824066,0.360,1.48870,10285,63,8.7277,82.45316,198.0,2021-09-11 02:43:00
6907,34.850967,0.422,1.48385,10285,67,8.7277,82.45316,248.0,2021-09-11 02:43:00
6908,34.883247,0.463,1.51850,10285,75,8.7277,82.45316,297.0,2021-09-11 02:43:00


## 2. Train the model

In [None]:
# Split the data: a random 10% of the rows is held out for inference and the
# remaining 90% is used for training.
# A fixed seed makes the split identical after every kernel restart, so the
# training and inference results can be reproduced.
SPLIT_SEED = 42
df_10 = df.sample(frac=0.10, random_state=SPLIT_SEED)  # 10% hold-out sample
df_90 = df.drop(df_10.index)  # everything that was not sampled

# The two parts must partition the table; this fails if index labels are
# duplicated, because drop() would then remove more rows than were sampled.
assert len(df_10) + len(df_90) == len(df), "train/inference split lost or duplicated rows"