In [34]:
import pandas as pd
import numpy as np
import geopandas as gpd
import xarray as xr
from pykrige.uk import UniversalKriging
from global_land_mask import globe
from scipy.ndimage import gaussian_filter
import warnings
import statsmodels.api as sm
import os

warnings.filterwarnings("ignore")

In [35]:
# Open the SST and chlorophyll-a NetCDF files as xarray datasets.
# (Going by the file names these are merged files covering 2002-08 .. 2022-10.)
sst_nc_path = 'C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//sst_merged_2002_08_2022_10.nc'
chl_nc_path = 'C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//chl_merged_2002_08_2022_10.nc'

ds = xr.open_dataset(sst_nc_path)
chl_ds = xr.open_dataset(chl_nc_path)

In [36]:
# Study period: one month-start timestamp per month from August 2002 through
# October 2022, i.e. 243 monthly time steps.
start_date, end_date = pd.Timestamp('2002-08-01'), pd.Timestamp('2022-10-01')

# 'MS' = month-start frequency, so every entry identifies one (year, month).
time_coords = pd.date_range(start_date, end_date, freq='MS')

In [37]:
# Pull the first time step of SST; it is only used to read the grid
# coordinates that the kriging functions pick up as module-level `lon` / `lat`.
first_step = time_coords[0]
sample_sst = ds['sst'].sel(time=first_step)

lat = sample_sst['lat'].values
lon = sample_sst['lon'].values

# Placeholder; the monthly loops rebind this to the unmasked kriging output.
get_idf = pd.DataFrame({})

In [38]:
def thermal_mask(df, column="thermal_mag"):
    """Classify each row's gradient magnitude as 'low', 'moderate' or 'high'.

    The 25th and 75th percentiles of ``df[column]`` are used as thresholds:

    * value <= 25th percentile          -> 'low'
    * 25th < value <= 75th percentile   -> 'moderate'
    * value > 75th percentile           -> 'high'
    * NaN value                         -> 'no data'

    NOTE: a later cell in this notebook defines ``thermal_mask`` again (reading
    the ``mean_tm`` column); on a top-to-bottom run that later definition
    replaces this one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the magnitude column. It is NOT modified.
    column : str, default "thermal_mag"
        Name of the column to classify.

    Returns
    -------
    pandas.Series
        Object-dtype Series named ``'thermal_mask'`` sharing ``df``'s index.
    """
    values = np.asarray(df[column], dtype=float)
    valid = ~np.isnan(values)

    # Empty or all-NaN input: there is nothing to rank, so no thresholds exist.
    if not valid.any():
        if values.size:
            print("all false, no data")
        return pd.Series(['no data'] * values.size, index=df.index,
                         name='thermal_mask', dtype=object)

    # NaN-aware percentiles: a single missing value must not turn both
    # thresholds into NaN (which would label every row 'no data').
    thresh_low, thresh_high = np.nanpercentile(values, [25, 75])

    with np.errstate(invalid='ignore'):
        is_low = values <= thresh_low
        is_moderate = (values >= thresh_low) & (values <= thresh_high)
        is_high = values >= thresh_high

    # np.select takes the first matching condition, which keeps the original
    # low -> moderate -> high priority on the shared threshold values.
    labels = np.select([is_low, is_moderate, is_high],
                       ['low', 'moderate', 'high'],
                       default='no data')

    if (labels == 'no data').any():
        print("all false, no data")

    return pd.Series(labels, index=df.index, name='thermal_mask', dtype=object)

In [39]:
#kriging function
def krig_function(gdf, value_col='sst', out_col='estimated_sst', grid_size=400):
    """Interpolate point observations onto a regular grid with universal kriging.

    The target grid spans the min/max of the module-level ``lon`` and ``lat``
    arrays with ``grid_size`` nodes per axis.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Point geometries plus the column ``value_col`` to interpolate.
    value_col : str, default 'sst'
        Column of ``gdf`` holding the observed values.
    out_col : str, default 'estimated_sst'
        Name of the column that receives the interpolated values.
    grid_size : int, default 400
        Number of grid nodes along each axis.

    Returns
    -------
    (interp_gdf, full_interp_gdf)
        ``full_interp_gdf`` holds every grid node with columns
        ``geometry, lat, lon, <out_col>``. ``interp_gdf`` is the same table
        with nodes flagged as land by ``globe.is_land`` removed and the index
        reset; the previous row position is kept in an ``index`` column.
    """
    # Observation coordinates and values
    x = gdf.geometry.x
    y = gdf.geometry.y
    z = gdf[value_col]

    # Regular target grid; meshgrid gives (len(y_grid), len(x_grid)) arrays,
    # flattened in the same row-major order as the kriging output below.
    x_grid = np.linspace(lon.min(), lon.max(), num=grid_size)
    y_grid = np.linspace(lat.min(), lat.max(), num=grid_size)
    XI, YI = np.meshgrid(x_grid, y_grid)
    grid_lon = XI.ravel()
    grid_lat = YI.ravel()

    unkrig = UniversalKriging(x, y, z, variogram_model="linear", verbose=False, enable_plotting=False)

    # The second return value is not used by this notebook.
    z_interp, _sigma = unkrig.execute("grid", x_grid, y_grid)

    # Build the table directly from the mesh arrays instead of extracting
    # lat/lon from every Point with a Python-level apply().
    interp_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(grid_lon, grid_lat))
    interp_gdf['lat'] = grid_lat
    interp_gdf['lon'] = grid_lon
    interp_gdf[out_col] = z_interp.ravel()

    # Keep the full grid for gradient analysis; drop land nodes from the
    # table that is returned for averaging.
    mask = globe.is_land(interp_gdf['lat'], interp_gdf['lon'])
    full_interp_gdf = interp_gdf
    interp_gdf = interp_gdf[~mask].reset_index()

    return interp_gdf, full_interp_gdf

In [48]:
#kriging function
def krig_function_chl(gdf, value_col='chlor_a', out_col='estimated_chl', grid_size=400):
    """Interpolate chlorophyll-a observations onto a regular grid (universal kriging).

    The target grid spans the min/max of the module-level ``lon`` and ``lat``
    arrays with ``grid_size`` nodes per axis.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Point geometries plus the column ``value_col`` to interpolate.
    value_col : str, default 'chlor_a'
        Column of ``gdf`` holding the observed values.
    out_col : str, default 'estimated_chl'
        Name of the column that receives the interpolated values.
    grid_size : int, default 400
        Number of grid nodes along each axis.

    Returns
    -------
    (interp_gdf, full_interp_gdf)
        ``full_interp_gdf`` holds every grid node with columns
        ``geometry, lat, lon, <out_col>``. ``interp_gdf`` is the same table
        with nodes flagged as land by ``globe.is_land`` removed and the index
        reset; the previous row position is kept in an ``index`` column.
    """
    # Observation coordinates and values
    x = gdf.geometry.x
    y = gdf.geometry.y
    z = gdf[value_col]

    # Regular target grid; meshgrid gives (len(y_grid), len(x_grid)) arrays,
    # flattened in the same row-major order as the kriging output below.
    x_grid = np.linspace(lon.min(), lon.max(), num=grid_size)
    y_grid = np.linspace(lat.min(), lat.max(), num=grid_size)
    XI, YI = np.meshgrid(x_grid, y_grid)
    grid_lon = XI.ravel()
    grid_lat = YI.ravel()

    unkrig = UniversalKriging(x, y, z, variogram_model="linear", verbose=False, enable_plotting=False)

    # The second return value is not used by this notebook.
    z_interp, _sigma = unkrig.execute("grid", x_grid, y_grid)

    # Build the table directly from the mesh arrays instead of extracting
    # lat/lon from every Point with a Python-level apply().
    interp_gdf = gpd.GeoDataFrame(geometry=gpd.points_from_xy(grid_lon, grid_lat))
    interp_gdf['lat'] = grid_lat
    interp_gdf['lon'] = grid_lon
    interp_gdf[out_col] = z_interp.ravel()

    # Keep the full grid; drop land nodes from the table returned for averaging.
    mask = globe.is_land(interp_gdf['lat'], interp_gdf['lon'])
    full_interp_gdf = interp_gdf
    interp_gdf = interp_gdf[~mask].reset_index()

    return interp_gdf, full_interp_gdf

In [49]:
#gradient analysis for thermal front
def thermal_front(idf, grid_shape=(400, 400)):
    """Compute the SST gradient magnitude on the full grid and return sea nodes only.

    Parameters
    ----------
    idf : pandas.DataFrame / geopandas.GeoDataFrame
        Full (unmasked) interpolation grid with columns ``estimated_sst``,
        ``lat`` and ``lon``, flattened row-major from a ``grid_shape`` array.
        It is NOT modified.
    grid_shape : tuple of int, default (400, 400)
        Shape used to fold ``estimated_sst`` back into a 2-D field; must match
        the grid the values were interpolated on.

    Returns
    -------
    pandas.Series
        Series named ``'thermal_mag'`` with the gradient magnitude of every
        node not flagged as land by ``globe.is_land``, re-indexed 0..n-1 so it
        lines up with a land-masked, index-reset table of the same grid.

    Notes
    -----
    ``np.gradient`` is called without spacing arguments, so the magnitude is
    expressed per grid step, not per degree or kilometre. No smoothing is
    applied to the field or to the magnitude.
    """
    sst_data = idf['estimated_sst'].values.reshape(grid_shape)

    # One derivative array per grid axis
    grad_axis0, grad_axis1 = np.gradient(sst_data)
    sst_mag = np.sqrt(np.square(grad_axis0) + np.square(grad_axis1))

    # Flatten back to one value per grid node, aligned with idf's rows.
    # A separate Series is used so the caller's frame is left untouched.
    thermal_mag = pd.Series(sst_mag.ravel(), index=idf.index, name='thermal_mag')

    # Drop land nodes and renumber the remaining rows from zero
    mask = globe.is_land(idf['lat'], idf['lon'])
    return thermal_mag[~mask].reset_index(drop=True)


In [50]:
def thermal_mask(df, column="mean_tm"):
    """Classify each row's mean gradient magnitude as 'low', 'moderate' or 'high'.

    The 25th and 75th percentiles of ``df[column]`` are used as thresholds:

    * value <= 25th percentile          -> 'low'
    * 25th < value <= 75th percentile   -> 'moderate'
    * value > 75th percentile           -> 'high'
    * NaN value                         -> 'no data'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the magnitude column. It is NOT modified.
    column : str, default "mean_tm"
        Name of the column to classify.

    Returns
    -------
    pandas.Series
        Object-dtype Series named ``'thermal_mask'`` sharing ``df``'s index.
    """
    values = np.asarray(df[column], dtype=float)
    valid = ~np.isnan(values)

    # Empty or all-NaN input: there is nothing to rank, so no thresholds exist.
    if not valid.any():
        if values.size:
            print("all false, no data")
        return pd.Series(['no data'] * values.size, index=df.index,
                         name='thermal_mask', dtype=object)

    # NaN-aware percentiles: a single missing value must not turn both
    # thresholds into NaN (which would label every row 'no data').
    thresh_low, thresh_high = np.nanpercentile(values, [25, 75])

    with np.errstate(invalid='ignore'):
        is_low = values <= thresh_low
        is_moderate = (values >= thresh_low) & (values <= thresh_high)
        is_high = values >= thresh_high

    # np.select takes the first matching condition, which keeps the original
    # low -> moderate -> high priority on the shared threshold values.
    labels = np.select([is_low, is_moderate, is_high],
                       ['low', 'moderate', 'high'],
                       default='no data')

    if (labels == 'no data').any():
        print("all false, no data")

    return pd.Series(labels, index=df.index, name='thermal_mask', dtype=object)

In [43]:

# Monthly SST climatology.
# For every calendar month: krige each year's SST field onto the common grid,
# attach the gradient magnitude, stack the yearly results side by side, then
# average across years, classify the mean gradient and write one CSV.

# Percentile limits used to winsorize raw SST before interpolation
lower_percentile = 1
upper_percentile = 99

months = np.arange(1, 13)

for month in months:
    # Positions in time_coords that belong to this calendar month (one per year)
    month_mask = time_coords.month == month
    month_indices = np.flatnonzero(month_mask)

    prev = gpd.GeoDataFrame({})
    first_year = True

    for month_num in month_indices:
        print(month_num)

        # Observed SST for this time step as a table of non-missing points
        month_sst = ds['sst'].sel(time=time_coords[month_num])
        month_df = month_sst.to_dataframe().reset_index().dropna()

        month_gdf = gpd.GeoDataFrame(month_df, geometry=gpd.points_from_xy(month_df.lon, month_df.lat))
        month_gdf.crs = 'WGS84'

        # Winsorize: clamp SST to its [1st, 99th] percentile range
        lower_limit, upper_limit = np.percentile(month_gdf['sst'], [lower_percentile, upper_percentile])
        month_gdf['sst'] = month_gdf['sst'].clip(lower_limit, upper_limit)

        # Interpolate; get_idf is the unmasked grid needed for the gradient
        month_gdf, get_idf = krig_function(month_gdf)
        month_gdf['thermal_mag'] = thermal_front(get_idf)

        print('interpolated')

        if first_year:
            prev = month_gdf
            first_year = False
        else:
            overlay_gdf = gpd.overlay(prev, month_gdf, how='intersection')

            # Both suffixed value columns are renamed to the same label on
            # purpose: the repeated 'estimated_sst' / 'thermal_mag' columns
            # accumulate one per year and are averaged after the loop.
            prev = overlay_gdf.drop(columns=['lon_2', 'lat_2']).rename(
                columns={'lon_1': 'lon', 'lat_1': 'lat' , 'estimated_sst_1': 'estimated_sst' , 'estimated_sst_2': 'estimated_sst',
                         'thermal_mag_1': 'thermal_mag' , 'thermal_mag_2': 'thermal_mag'})

        print('passed')

    # Across-year means over every column whose name contains the pattern
    prev['mean_sst'] = prev.filter(like='sst').mean(axis=1)
    prev['mean_tm'] = prev.filter(like='thermal_mag').mean(axis=1)
    prev['thermal_mask'] = thermal_mask(prev)

    # Keep only the columns that go into the CSV
    final_save = prev.loc[:, ['lat', 'lon', 'mean_sst', 'mean_tm', 'thermal_mask']]

    print('saving')
    final_save.to_csv('C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//sst_' + str(month) + '.csv', index=False)

5
interpolated
passed
17
interpolated
passed
29
interpolated
passed
41
interpolated
passed
53
interpolated
passed
65
interpolated
passed
77
interpolated
passed
89
interpolated
passed
101
interpolated
passed
113
interpolated
passed
125
interpolated
passed
137
interpolated
passed
149
interpolated
passed
161
interpolated
passed
173
interpolated
passed
185
interpolated
passed
197
interpolated
passed
209
interpolated
passed
221
interpolated
passed
233
interpolated
passed
saving
6
interpolated
passed
18
interpolated
passed
30
interpolated
passed
42
interpolated
passed
54
interpolated
passed
66
interpolated
passed
78
interpolated
passed
90
interpolated
passed
102
interpolated
passed
114
interpolated
passed
126
interpolated
passed
138
interpolated
passed
150
interpolated
passed
162
interpolated
passed
174
interpolated
passed
186
interpolated
passed
198
interpolated
passed
210
interpolated
passed
222
interpolated
passed
234
interpolated
passed
saving
7
interpolated
passed
19
interpolated
passed

In [51]:

# Monthly chlorophyll-a climatology.
# For every calendar month: robust-adjust and krige each year's chl-a field
# onto the common grid, stack the yearly results side by side, then average
# across years and write one CSV.

# Quantile limits used to winsorize raw chl-a
CHL_WINSOR_LOWER = 0.05
CHL_WINSOR_UPPER = 0.95
# Huber tuning constant
HUBER_T = 1.345


# The helpers are defined once here instead of being re-created on every
# iteration of the inner loop.
def huber_loss(residuals, c=1.345):
    """Huber loss: quadratic for |r| < c, linear beyond.

    Kept for reference; the robust fit below uses statsmodels' own HuberT norm
    and never calls this function.
    """
    return np.where(abs(residuals) < c, 0.5 * residuals ** 2, c * (abs(residuals) - 0.5 * c))


def m_estimator(data, loss_function, tuning_param, endog=None):
    """Robust (Huber) regression of ``endog`` on ``data`` plus an intercept.

    Parameters
    ----------
    data : pandas.Series
        Regressor (here: the winsorized chl-a values).
    loss_function : callable
        Accepted for backward compatibility; not used by the fit.
    tuning_param : float
        Threshold ``t`` of the HuberT norm.
    endog : pandas.Series, optional
        Response variable. When omitted, falls back to the current
        module-level ``month_gdf['chlor_a']`` (the original behaviour).

    Returns
    -------
    pandas.Series
        Fitted values of the robust linear model.
    """
    if endog is None:
        endog = month_gdf['chlor_a']
    # Add a constant column to serve as the intercept
    exog = sm.add_constant(data)
    model = sm.RLM(endog, exog=exog, M=sm.robust.norms.HuberT(t=tuning_param))
    return model.fit().fittedvalues


months = np.arange(1, 13)

for month in months:
    # Positions in time_coords that belong to this calendar month (one per year)
    month_mask = time_coords.month == month
    month_indices = np.where(month_mask)[0]

    prev = gpd.GeoDataFrame({})
    first_year = True

    for month_num in month_indices:
        print(month_num)

        # Observed chl-a for this time step as a table of non-missing points
        month_chla = chl_ds['chlor_a'].sel(time=time_coords[month_num])
        month_df = month_chla.to_dataframe().reset_index().dropna()

        month_gdf = gpd.GeoDataFrame(month_df, geometry=gpd.points_from_xy(month_df.lon, month_df.lat))
        month_gdf.crs = 'WGS84'

        # Winsorize: clamp chl-a to its [5%, 95%] quantile range
        raw_chl = month_gdf['chlor_a']
        winsorized = np.clip(raw_chl, raw_chl.quantile(CHL_WINSOR_LOWER), raw_chl.quantile(CHL_WINSOR_UPPER))

        # Replace chl-a by the fitted values of a Huber regression of the raw
        # values on the winsorized ones.
        # NOTE(review): these fitted values are a linear function
        # (intercept + slope * winsorized) of the winsorized series; confirm
        # this is the intended robust-adjustment step.
        final_values = m_estimator(winsorized, huber_loss, HUBER_T, endog=raw_chl)
        month_gdf['chlor_a'] = final_values.astype(float)

        # get_idf (the unmasked grid) is not used further in this loop
        month_gdf, get_idf = krig_function_chl(month_gdf)

        print('interpolated')

        if first_year:
            prev = month_gdf
            first_year = False
        else:
            overlay_gdf = gpd.overlay(prev, month_gdf, how='intersection')

            # Both suffixed value columns are renamed to the same label on
            # purpose: the repeated 'estimated_chl' columns accumulate one per
            # year and are averaged after the loop. (The thermal_mag keys of
            # the SST version never exist here and were dropped from the map.)
            prev = overlay_gdf.drop(columns=['lon_2', 'lat_2']).rename(
                columns={'lon_1': 'lon', 'lat_1': 'lat',
                         'estimated_chl_1': 'estimated_chl', 'estimated_chl_2': 'estimated_chl'})

        print('passed')

    # Across-year mean over every column whose name contains 'chl'
    prev['mean_chla'] = prev.filter(like='chl').mean(axis=1)

    # Keep only the columns that go into the CSV
    final_save_chl = prev.loc[:, ['lat', 'lon', 'mean_chla']]

    print('saving')
    final_save_chl.to_csv('C:/Users/Acer/Documents/SchoolHard/Thesis/Code/dataset//chla_' + str(month) + '.csv', index=False)

5
interpolated
passed
17
interpolated
passed
29
interpolated
passed
41
interpolated
passed
53
interpolated
passed
65
interpolated
passed
77
interpolated
passed
89
interpolated
passed
101
interpolated
passed
113
interpolated
passed
125
interpolated
passed
137
interpolated
passed
149
interpolated
passed
161
interpolated
passed
173
interpolated
passed
185
interpolated
passed
197
interpolated
passed
209
interpolated
passed
221
interpolated
passed
233
interpolated
passed
saving
6
interpolated
passed
18
interpolated
passed
30
interpolated
passed
42
interpolated
passed
54
interpolated
passed
66
interpolated
passed
78
interpolated
passed
90
interpolated
passed
102
interpolated
passed
114
interpolated
passed
126
interpolated
passed
138
interpolated
passed
150
interpolated
passed
162
interpolated
passed
174
interpolated
passed
186
interpolated
passed
198
interpolated
passed
210
interpolated
passed
222
interpolated
passed
234
interpolated
passed
saving
7
interpolated
passed
19
interpolated
passed

In [None]:
# TODO (Drew): please add a for loop that merges the SST and chl-a CSVs,
# and likewise save each merged SST + chl-a table as a CSV.


In [52]:
# JOIN SST AND SSC (chl-a) CSVs BY MONTH
# For every sst_<n>.csv in the directory, find chla_<n>.csv, inner-join the two
# tables on (lat, lon) and write the result as sst_chl_<n>.csv alongside them.

# Directory where the monthly CSV files are stored
directory = 'C:/Users/Acer/Documents/SchoolHard/Thesis/Code/fin_csv'

# All CSV files in the directory; sorted because os.listdir order is arbitrary
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
print(csv_files)

# Number of successfully merged file pairs
num_merged_files = 0

for sst_file in csv_files:
    if not sst_file.startswith('sst_'):
        continue

    # Text between the 'sst_' prefix and the '.csv' suffix
    sst_num = sst_file[4:-4]

    # Merged outputs (sst_chl_<n>.csv) from an earlier run also start with
    # 'sst_'; only purely numeric suffixes are SST inputs.
    if not sst_num.isdigit():
        continue

    print(sst_file)
    chl_file = f"chla_{sst_num}.csv"
    print(sst_num)
    print(chl_file)

    if chl_file not in csv_files:
        print(f"No {chl_file} found for {sst_file}; skipped")
        continue

    try:
        # Read in the SST and CHL CSV files as dataframes
        sst_df = pd.read_csv(os.path.join(directory, sst_file))
        chl_df = pd.read_csv(os.path.join(directory, chl_file))

        # Inner join on the grid coordinates; keys must match exactly
        merged_df = pd.merge(sst_df, chl_df, on=['lat', 'lon'])

        # Rows present in the SST table but absent after the join were dropped
        if len(merged_df) < len(sst_df):
            print(f"Warning: {len(sst_df) - len(merged_df)} of {len(sst_df)} rows in {sst_file} had no (lat, lon) match in {chl_file}")

        # Keep only the first of any identically named columns
        merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

        # Write the merged dataframe to a new CSV file
        merged_file_name = f"sst_chl_{sst_num}.csv"
        merged_df.to_csv(os.path.join(directory, merged_file_name), index=False)

        num_merged_files += 1

        print(f"Merged {sst_file} and {chl_file} into {merged_file_name}")
    except Exception as e:
        # Best effort: report the failure and continue with the next month
        print(f"Error merging {sst_file} and {chl_file}: {e}")

# Report how many file pairs were merged
print(f"Successfully merged {num_merged_files} files.")

['chla_1.csv', 'chla_10.csv', 'chla_11.csv', 'chla_12.csv', 'chla_2.csv', 'chla_3.csv', 'chla_4.csv', 'chla_5.csv', 'chla_6.csv', 'chla_7.csv', 'chla_8.csv', 'chla_9.csv', 'sst_1.csv', 'sst_10.csv', 'sst_11.csv', 'sst_12.csv', 'sst_2.csv', 'sst_3.csv', 'sst_4.csv', 'sst_5.csv', 'sst_6.csv', 'sst_7.csv', 'sst_8.csv', 'sst_9.csv']
sst_1.csv
1
chla_1.csv
Merged sst_1.csv and chla_1.csv into sst_chl_1.csv
sst_10.csv
10
chla_10.csv
Merged sst_10.csv and chla_10.csv into sst_chl_10.csv
sst_11.csv
11
chla_11.csv
Merged sst_11.csv and chla_11.csv into sst_chl_11.csv
sst_12.csv
12
chla_12.csv
Merged sst_12.csv and chla_12.csv into sst_chl_12.csv
sst_2.csv
2
chla_2.csv
Merged sst_2.csv and chla_2.csv into sst_chl_2.csv
sst_3.csv
3
chla_3.csv
Merged sst_3.csv and chla_3.csv into sst_chl_3.csv
sst_4.csv
4
chla_4.csv
Merged sst_4.csv and chla_4.csv into sst_chl_4.csv
sst_5.csv
5
chla_5.csv
Merged sst_5.csv and chla_5.csv into sst_chl_5.csv
sst_6.csv
6
chla_6.csv
Merged sst_6.csv and chla_6.csv into