In [10]:
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pykrige.uk import UniversalKriging
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
from scipy.interpolate import NearestNDInterpolator
from global_land_mask import globe
import os

In [4]:
from global_land_mask import globe

In [2]:
# Open the merged SST NetCDF file with xarray. Later cells read its 'sst'
# variable, its 'lon'/'lat' coordinates and select along 'time'.
# NOTE(review): the file name suggests monthly data from 2002-08 to 2022-10 — confirm.
ds = xr.open_dataset('C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//sst_merged_2002_08_2022_10.nc')

In [6]:
# Display the dataset summary (dimensions, coordinates, variables)
ds

In [3]:
# Directory holding one reference CSV per evaluated month
dir_path = "C:/Users/Acer/Documents/SchoolHard/Thesis/Code/fin_csv/UniKrig"

# os.listdir() returns entries in arbitrary order, but the evaluation loop
# pairs file_paths[i] with the i-th month purely by position, so the names are
# sorted to make that pairing deterministic.
# NOTE(review): this assumes the lexicographic order of the file names matches
# the chronological order of the months — verify against the actual file names.
file_names = sorted(os.listdir(dir_path))

# Full path of every file, in the same (sorted) order
file_paths = [os.path.join(dir_path, file_name) for file_name in file_names]

In [4]:
# Study period: August 2002 through October 2022 (243 monthly observations)
start_date, end_date = pd.Timestamp('2002-08-01'), pd.Timestamp('2022-10-01')

# Monthly time axis: one timestamp per month, placed on the first day of
# the month ('MS' = month start), both endpoints included
time_coords = pd.date_range(start_date, end_date, freq='MS')

In [5]:
# Keep only the timestamps of the monthly axis that fall in calendar year 2022
in_2022 = time_coords.year == 2022
time_coords_2022 = time_coords[in_2022]

# Month number of each selected timestamp
months_2022 = time_coords_2022.month

# Show the month numbers, one per line
for month in months_2022:
    print(month)

1
2
3
4
5
6
7
8
9
10


In [6]:
# Display the month-start timestamps selected for 2022
time_coords_2022

DatetimeIndex(['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01',
               '2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01',
               '2022-09-01', '2022-10-01'],
              dtype='datetime64[ns]', freq='MS')

In [7]:
# Number of 2022 months; the evaluation loop below runs once per entry
print(len(time_coords_2022))

10


In [11]:
# Universal Kriging evaluation for every month in time_coords_2022.
# For each month: fit the kriging model on the winsorized observed SST points,
# predict on a regular 400 x 400 lon/lat grid, discard land cells and score the
# remaining predictions against the 'mean_sst' column of the matching CSV in
# file_paths. Three lines are printed per month, in the order MSE, RMSE, MAE.

# ---- Loop-invariant setup (none of this depends on the month) ----

# Percentile bounds used to winsorize the observations before interpolation
lower_percentile = 1
upper_percentile = 99

# The dataset coordinates only define the extent of the prediction grid
lon = ds['lon'].values
lat = ds['lat'].values
x_grid = np.linspace(lon.min(), lon.max(), num=400)
y_grid = np.linspace(lat.min(), lat.max(), num=400)
XI, YI = np.meshgrid(x_grid, y_grid)

# Flattened grid coordinates; these are paired element-wise with
# z_interp.ravel() inside the loop
grid_lon = XI.ravel()
grid_lat = YI.ravel()

# The grid is identical for every month, so the land mask is computed once
mask = globe.is_land(grid_lat, grid_lon)

for i, month_start in enumerate(time_coords_2022):

    # Reference values for this month.
    # NOTE(review): the CSV is matched to the month purely by position in
    # file_paths, and its rows are compared position-by-position with the
    # ocean grid points below — confirm both orderings.
    mean_data = pd.read_csv(file_paths[i])

    # SST field of the current month
    month_sst = ds['sst'].sel(time=month_start)

    # Flatten the field to a table with one row per grid cell
    df = month_sst.to_dataframe().reset_index()

    # Point GeoDataFrame in WGS84; rows containing missing values are dropped
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.lon, df.lat),
        crs='WGS84',
    ).dropna()

    # Winsorization: clip SST to its [1st, 99th] percentile range
    lower_limit, upper_limit = np.percentile(gdf['sst'], [lower_percentile, upper_percentile])
    gdf['sst'] = np.clip(gdf['sst'], lower_limit, upper_limit)

    # Observation coordinates and values fed to the kriging model
    x = gdf.geometry.x
    y = gdf.geometry.y
    z = gdf['sst']

    # Universal Kriging with a linear variogram, evaluated on the regular grid
    unkrig = UniversalKriging(x, y, z, variogram_model="linear", verbose=False, enable_plotting=False)
    z_interp, sigma = unkrig.execute("grid", x_grid, y_grid)

    # Interpolated values as a GeoDataFrame
    # (column order: geometry, lat, lon, estimated_sst)
    interp_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(grid_lon, grid_lat))
    interp_gdf['lat'] = grid_lat
    interp_gdf['lon'] = grid_lon
    interp_gdf['estimated_sst'] = z_interp.ravel()

    # Plain-DataFrame version without the geometry column, ocean points only
    idf = pd.DataFrame(interp_gdf.drop('geometry', axis=1))
    final_idf = idf[~mask]

    # Ocean-only GeoDataFrame; reset_index() keeps the original grid position
    # in an 'index' column
    interp_gdf = interp_gdf[~mask].reset_index()

    # Error metrics between the kriging estimates and the reference values
    mse = mean_squared_error(interp_gdf['estimated_sst'], mean_data['mean_sst'])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(interp_gdf['estimated_sst'], mean_data['mean_sst'])

    # Printed in the order MSE, RMSE, MAE; i is the 0-based month position
    print("Month " + str(i) + ": " + str(mse))
    print("Month " + str(i) + ": " + str(rmse))
    print("Month " + str(i) + ": " + str(mae))

Month 0: 0.16052173496298128
Month 0: 0.4006516379137633
Month 0: 0.3456014847630625
Month 1: 1.5565665401226976
Month 1: 1.247624358580217
Month 1: 1.2290819962580835
Month 2: 0.5337341435762399
Month 2: 0.7305711078165081
Month 2: 0.6762629462116692
Month 3: 0.5181688622389817
Month 3: 0.7198394697701577
Month 3: 0.6462068475242503
Month 4: 2.597801239876185
Month 4: 1.6117695988807412
Month 4: 1.577359132551085
Month 5: 2.8495976660609705
Month 5: 1.6880751363789974
Month 5: 1.6724100705652345
Month 6: 1.1507390899682797
Month 6: 1.0727250766008407
Month 6: 1.0413062899621794
Month 7: 0.06517772016052753
Month 7: 0.25529927567568134
Month 7: 0.22249942252100563
Month 8: 0.13346609137206095
Month 8: 0.3653301128733586
Month 8: 0.2931614015431357
Month 9: 0.3116433162011422
Month 9: 0.5582502272289213
Month 9: 0.5319432414583737
