In [1]:
import geopandas as gpd
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pykrige.uk import UniversalKriging
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import random
from scipy.interpolate import NearestNDInterpolator
import os

In [12]:
from global_land_mask import globe
import statsmodels.api as sm

In [5]:
# Open the merged chlorophyll NetCDF file. Judging by the file name it spans
# 2002-08 through 2022-10 -- TODO confirm against the dataset's time coordinate.
chl_dataset_path = 'C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//chl_merged_2002_08_2022_10.nc'
ds = xr.open_dataset(chl_dataset_path)

In [None]:
# Show the dataset summary so its variables and coordinates can be inspected
# (later cells read the 'chlor_a' variable and the 'lon', 'lat', 'time' coordinates).
ds

In [6]:
# Directory containing the CSV files that a later cell reads with pd.read_csv,
# one per 2022 month, selected by position (file_paths[i] <-> i-th month).
dir_path = "C:/Users/Acer/Documents/SchoolHard/Thesis/Code/fin_csv/UniKrig"

# os.listdir() returns entries in arbitrary, platform-dependent order. Because
# the files are matched to months purely by their position in this list, sort
# the names so the pairing is deterministic on every machine and every run.
# NOTE(review): this is a lexicographic sort -- confirm that the file names sort
# chronologically (e.g. zero-padded month numbers), otherwise "10" sorts before "2".
file_names = sorted(os.listdir(dir_path))

# Full path of every file, in the same sorted order as file_names
file_paths = [os.path.join(dir_path, file_name) for file_name in file_names]

In [7]:
# Monthly time axis: first and last month of the record (243 monthly steps in total)
start_date, end_date = pd.Timestamp('2002-08-01'), pd.Timestamp('2022-10-01')

# One month-start ('MS') timestamp per month, both endpoints included
time_coords = pd.date_range(start_date, end_date, freq='MS')

In [8]:
# Keep only the month-start timestamps that fall in 2022
in_2022 = time_coords.year == 2022
time_coords_2022 = time_coords[in_2022]

# Calendar month number (1-12) of each retained timestamp
months_2022 = time_coords_2022.month

# Print one month number per line
for month in months_2022:
    print(month)

1
2
3
4
5
6
7
8
9
10


In [9]:
# Display the 2022 month-start timestamps selected in the previous cell.
time_coords_2022

DatetimeIndex(['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01',
               '2022-05-01', '2022-06-01', '2022-07-01', '2022-08-01',
               '2022-09-01', '2022-10-01'],
              dtype='datetime64[ns]', freq='MS')

In [10]:
# Number of 2022 months that the evaluation loop will iterate over
print(time_coords_2022.size)

10


In [14]:
# --- Tunable settings --------------------------------------------------------
WINSOR_LOWER_Q = 0.05   # lower quantile used to clip chlor_a before the robust fit
WINSOR_UPPER_Q = 0.95   # upper quantile used to clip chlor_a before the robust fit
HUBER_TUNING = 1.345    # tuning constant handed to the Huber norm
GRID_POINTS = 400       # number of interpolation grid nodes along each axis


# --- Helpers (defined once, not on every loop iteration) ---------------------
def huber_loss(residuals, c=1.345):
    """Elementwise Huber loss.

    Returns 0.5 * r**2 where |r| < c and c * (|r| - 0.5 * c) elsewhere.

    Note: m_estimator() accepts a loss function only to keep its signature
    stable; it never calls it. The robust fit uses statsmodels' HuberT norm.
    """
    abs_residuals = abs(residuals)
    return np.where(abs_residuals < c, 0.5 * residuals ** 2, c * (abs_residuals - 0.5 * c))


def m_estimator(data, loss_function, tuning_param, endog=None):
    """Fit a Huber robust linear model and return its fitted values.

    Parameters
    ----------
    data : regressor values; a constant column is added to serve as intercept.
    loss_function : unused, kept for backward compatibility of the signature.
    tuning_param : value passed as ``t`` to ``sm.robust.norms.HuberT``.
    endog : regression target. When omitted, falls back to the global
        ``gdf['chlor_a']``, which is what the original implementation always
        used implicitly.

    Returns
    -------
    The ``fittedvalues`` of the fitted ``sm.RLM`` model.
    """
    if endog is None:
        endog = gdf['chlor_a']  # legacy behaviour: target taken from the global gdf
    exog = sm.add_constant(data)
    model = sm.RLM(endog, exog=exog, M=sm.robust.norms.HuberT(t=tuning_param))
    return model.fit().fittedvalues


# --- Loop-invariant setup ----------------------------------------------------
# The grid depends only on the dataset's coordinate extent, so build it once.
lon = ds['lon'].values
lat = ds['lat'].values

x_grid = np.linspace(lon.min(), lon.max(), num=GRID_POINTS)
y_grid = np.linspace(lat.min(), lat.max(), num=GRID_POINTS)
XI, YI = np.meshgrid(x_grid, y_grid)
grid_lon = XI.ravel()
grid_lat = YI.ravel()

# The land mask depends only on the grid coordinates, so it is identical for
# every month; compute it once instead of twice per iteration.
mask = globe.is_land(grid_lat, grid_lon)

# --- Per-month interpolation and evaluation ----------------------------------
for i, month_start in enumerate(time_coords_2022):

    # Reference values for this month; files are matched to months by position.
    mean_data = pd.read_csv(file_paths[i])

    # chlor_a field of this month (variable name kept from the original notebook)
    month_sst = ds['chlor_a'].sel(time=month_start)

    # Flatten to one row per grid cell, attach point geometries, drop rows with NaN
    df = month_sst.to_dataframe().reset_index()
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.lon, df.lat),
        crs='WGS84',
    ).dropna()

    # Winsorize: clip chlor_a to its [WINSOR_LOWER_Q, WINSOR_UPPER_Q] quantile range
    raw_chl = gdf['chlor_a']
    winsorized = np.clip(
        raw_chl,
        raw_chl.quantile(WINSOR_LOWER_Q),
        raw_chl.quantile(WINSOR_UPPER_Q),
    )

    # Robust (Huber) regression of the raw values on the winsorized values; the
    # fitted values replace chlor_a as the kriging input.
    final_values = m_estimator(winsorized, huber_loss, HUBER_TUNING, endog=raw_chl)
    gdf['chlor_a'] = final_values.astype(float)

    # Extract X, Y, and Z values
    x = gdf.geometry.x
    y = gdf.geometry.y
    z = gdf['chlor_a']

    # Universal Kriging with a linear variogram, evaluated on the regular grid
    unkrig = UniversalKriging(x, y, z, variogram_model="linear", verbose=False, enable_plotting=False)
    z_interp, sigma = unkrig.execute("grid", x_grid, y_grid)

    # Interpolated values as a GeoDataFrame; columns: geometry, lat, lon, estimated_chla.
    # lat/lon come straight from the grid arrays rather than a per-point Python apply.
    interp_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(grid_lon, grid_lat))
    interp_gdf['lat'] = grid_lat
    interp_gdf['lon'] = grid_lon
    interp_gdf['estimated_chla'] = z_interp.ravel()

    # Plain-DataFrame version without geometry, then remove land points from both
    idf = pd.DataFrame(interp_gdf.drop(columns='geometry'))
    final_idf = idf[~mask]
    interp_gdf = interp_gdf[~mask].reset_index()

    # Compare the ocean-only estimates with the reference column, row by row.
    mse = mean_squared_error(interp_gdf['estimated_chla'], mean_data['mean_chla'])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(interp_gdf['estimated_chla'], mean_data['mean_chla'])

    # Three lines per month, in this order: MSE, RMSE, MAE.
    # The label is the 0-based position within 2022 (0 = January).
    print("Month " + str(i) + ": " + str(mse))
    print("Month " + str(i) + ": " + str(rmse))
    print("Month " + str(i) + ": " + str(mae))

Month 0: 0.002543025957762424
Month 0: 0.05042842410548266
Month 0: 0.02386449417234882
Month 1: 0.00833610111495924
Month 1: 0.09130225142327675
Month 1: 0.05558620563343114
Month 2: 0.03346056785186741
Month 2: 0.1829223000398459
Month 2: 0.10290793002984833
Month 3: 0.004630139582910748
Month 3: 0.06804512901678376
Month 3: 0.03799233261945022
Month 4: 0.041124415123277545
Month 4: 0.20279155584806174
Month 4: 0.09974258560545905
Month 5: 0.02271953442665323
Month 5: 0.15073000506419826
Month 5: 0.07037802029394498
Month 6: 0.04232827271257395
Month 6: 0.2057383598470979
Month 6: 0.12333195709379786
Month 7: 0.09504169107432944
Month 7: 0.30828832458322103
Month 7: 0.14395486343623234
Month 8: 0.005300298806320167
Month 8: 0.07280315107411332
Month 8: 0.04419601437266832
Month 9: 0.00957527349871384
Month 9: 0.09785332645706961
Month 9: 0.05854489020216362
