In [1]:
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pykrige.ok import OrdinaryKriging
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
from scipy.stats import mstats


In [3]:
ds_sst = xr.open_dataset('C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//sst_merged_2002_08_2022_10.nc')

In [4]:
ds_sst

In [5]:
#define time steps, 243 monthly observations
start_date = pd.Timestamp('2002-08-01')
end_date = pd.Timestamp('2022-10-01')

#Create a new time coordinate that represents the month and year
time_coords = pd.date_range(start=start_date, end=end_date, freq='MS')

In [6]:
n = 24
samples = random.sample(list(time_coords), n)


In [7]:
#Define the number of folds for cross validation
n_splits = 3

In [8]:
#Create a KFold object to split the data
kf = KFold(n_splits=n_splits)

In [9]:
#metrics list
mse_list = []
rmse_list = []
mae_list = []

In [10]:
# raw sst dataset test
for t in range(len(samples)):
    
    print('set: ' + str(t))
    
    monthly_data = ds_sst['sst'].sel(time= samples[t])
    lon = monthly_data['lon'].values
    lat = monthly_data['lat'].values
    
    #Convert the dataset to a pandas dataframe
    df_sst = monthly_data.to_dataframe().reset_index()
    
    #convert dataframe to geodataframe 
    gdf_sst = gpd.GeoDataFrame(df_sst, geometry=gpd.points_from_xy(df_sst.lon, df_sst.lat))
    
    #SET CRS to WGS84
    gdf_sst.crs = 'WGS84'
    
    #dropna in gdf
    gdf_sst = gdf_sst.dropna()
    
    #Get x, y, z values - this is the lon, lat, and variable/sst
    x = gdf_sst.geometry.x
    y = gdf_sst.geometry.y
    z = gdf_sst['sst']
    
    x_grid = np.linspace(lon.min(), lon.max(), num=400)
    y_grid = np.linspace(lat.min(), lat.max(), num=400)
    XI, YI = np.meshgrid(x_grid, y_grid)
    
    for train_index, test_index in kf.split(gdf_sst):
        #Train and Test splits
        train_gdf = gdf_sst.iloc[train_index].reset_index()
        test_gdf = gdf_sst.iloc[test_index].reset_index()

        #Train data x,y,z
        x_train = train_gdf.geometry.x
        y_train = train_gdf.geometry.y
        z_train = train_gdf['sst']

        #Ordinary Kriging
        orkrig = OrdinaryKriging(x_train, y_train, z_train, variogram_model="linear", verbose=False, enable_plotting=False)
    
        #Train data x,y,z
        x_test = test_gdf.geometry.x
        y_test = test_gdf.geometry.y
        z_test = test_gdf['sst']

        ZI_test = np.zeros_like(x_test)
        for i in range(len(x_test)):
            ZI_test[i], sigma = orkrig.execute("grid", x_test[i], y_test[i])

        #MSE -mean squared error
        mse = mean_squared_error(z_test, ZI_test)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(z_test, ZI_test)
    
        mse_list.append(mse)
        rmse_list.append(rmse)
        mae_list.append(mae)
    
    

set: 0
set: 1
set: 2
set: 3
set: 4
set: 5
set: 6
set: 7
set: 8
set: 9
set: 10
set: 11
set: 12
set: 13
set: 14
set: 15
set: 16
set: 17
set: 18
set: 19
set: 20
set: 21
set: 22
set: 23


In [11]:
# Compute the mean  of MSE MAE RMSE across all folds
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)

mean_rmse = np.mean(rmse_list)
std_rmse = np.std(rmse_list)

mean_mae = np.mean(mae_list)
std_mae = np.std(mae_list)

print(mean_mse)
print(mean_rmse)
print(mean_mae)

0.20402696723597438
0.4314662261591805
0.3229380564200308


In [12]:
# test for processed sst
for t in range(len(samples)):
    
    print('set: ' + str(t))
    
    monthly_data = ds_sst['sst'].sel(time= samples[t])
    lon = monthly_data['lon'].values
    lat = monthly_data['lat'].values
    
    #Convert the dataset to a pandas dataframe
    df_sst = monthly_data.to_dataframe().reset_index()
    
    #convert dataframe to geodataframe 
    gdf_sst = gpd.GeoDataFrame(df_sst, geometry=gpd.points_from_xy(df_sst.lon, df_sst.lat))
    
    #SET CRS to WGS84
    gdf_sst.crs = 'WGS84'
    
    #dropna in gdf
    gdf_sst = gdf_sst.dropna()
    
    # Define the percentile values for Winsorization
    lower_percentile = 1
    upper_percentile = 99

    # Get the lower and upper limits
    lower_limit, upper_limit = np.percentile(gdf_sst['sst'], [lower_percentile, upper_percentile])

    # Make 'sst_final' column of the geodataframe with the winsorized data
    gdf_sst['sst_final'] = np.clip(gdf_sst['sst'], lower_limit, upper_limit)
    
    #Get x, y, z values - this is the lon, lat, and variable/sst
    x = gdf_sst.geometry.x
    y = gdf_sst.geometry.y
    z = gdf_sst['sst_final']
    
    x_grid = np.linspace(lon.min(), lon.max(), num=400)
    y_grid = np.linspace(lat.min(), lat.max(), num=400)
    XI, YI = np.meshgrid(x_grid, y_grid)
    
    for train_index, test_index in kf.split(gdf_sst):
        #Train and Test splits
        train_gdf = gdf_sst.iloc[train_index].reset_index()
        test_gdf = gdf_sst.iloc[test_index].reset_index()

        #Train data x,y,z
        x_train = train_gdf.geometry.x
        y_train = train_gdf.geometry.y
        z_train = train_gdf['sst_final']

        #Ordinary Kriging
        orkrig = OrdinaryKriging(x_train, y_train, z_train, variogram_model="linear", verbose=False, enable_plotting=False)
    
        #Train data x,y,z
        x_test = test_gdf.geometry.x
        y_test = test_gdf.geometry.y
        z_test = test_gdf['sst_final']

        ZI_test = np.zeros_like(x_test)
        for i in range(len(x_test)):
            ZI_test[i], sigma = orkrig.execute("grid", x_test[i], y_test[i])

        #MSE -mean squared error
        mse = mean_squared_error(z_test, ZI_test)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(z_test, ZI_test)
    
        mse_list.append(mse)
        rmse_list.append(rmse)
        mae_list.append(mae)

set: 0
set: 1
set: 2
set: 3
set: 4
set: 5
set: 6
set: 7
set: 8
set: 9
set: 10
set: 11
set: 12
set: 13
set: 14
set: 15
set: 16
set: 17
set: 18
set: 19
set: 20
set: 21
set: 22
set: 23


In [13]:
# Compute the mean  of MSE MAE RMSE across all folds for processed sst dataset
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)

mean_rmse = np.mean(rmse_list)
std_rmse = np.std(rmse_list)

mean_mae = np.mean(mae_list)
std_mae = np.std(mae_list)

print(mean_mse)
print(mean_rmse)
print(mean_mae)

0.16882240849026436
0.39057407680409184
0.302609445398738


In [14]:
```
# Raw SST data
0.25285170143459307
0.47572158851718543
0.3561845546268448

# Winsorized data
0.20762421093360364
0.4311379199039492
0.3325764183969902

              sst   sst_final
count  929.000000  929.000000
mean    29.805275   29.800429
std      0.520612    0.457136
min     28.084999   29.127000
25%     29.469999   29.469999
50%     29.709999   29.709999
75%     30.054998   30.054998
max     32.000000   30.830000

SyntaxError: invalid syntax (117921531.py, line 1)

In [None]:
print(pd.concat([gdf_sst['sst'].describe(), gdf_sst['sst_final'].describe()], axis=1))