In [13]:
import pandas as pd
import numpy as np

from scipy.spatial import cKDTree

import os

import netCDF4 as nc
import xarray as xr

import re
from glob import glob

import time

# --- Input locations -------------------------------------------------------
file_path = '../../01_data/01_biological_data'
file_name = 'metadata.tsv'
sat_data_path = '../../01_data/00_satellite_data'

# --- Sample metadata -------------------------------------------------------
# Tab-separated table; its first column becomes the index.
file = os.path.join(file_path, file_name)
md = pd.read_csv(file, sep='\t', index_col=0)

# Keep only the rows whose Layer is 'SRF'; copy so new columns can be added
# without touching `md`.
md_srf = md.loc[md['Layer'] == 'SRF'].copy()

# 'Event.date' starts with 'YYYY-MM'; drop the dash to get 'YYYYMM', then
# append '01' for the first-of-month variant.
md_srf['Event.date.YYYYMM'] = md_srf['Event.date'].str[:7].str.replace('-', '')
md_srf['Event.date.YYYYMM01'] = md_srf['Event.date.YYYYMM'] + '01'

# --- Satellite products ----------------------------------------------------
# Entries are '<PRODUCT>.<variable>'; the loop below uses the full string in
# the file-name pattern and the part after the dot as the variable name.
satellite_features = [
    'CHL.chlor_a',
    'FLH.nflh',
    'KD.Kd_490',
    'PAR.par',
    'PIC.pic',
    'POC.poc',
    'RRS.Rrs_412', 'RRS.Rrs_443', 'RRS.Rrs_469', 'RRS.Rrs_488',
    'RRS.Rrs_531', 'RRS.Rrs_547', 'RRS.Rrs_555', 'RRS.Rrs_645',
    'RRS.Rrs_667', 'RRS.Rrs_678',
    'SST.sst',
]

# One empty (samples x features) table per satellite, filled in by the loop.
satellite_data_terra = pd.DataFrame(columns=satellite_features, index=md_srf.index)
satellite_data_aqua = pd.DataFrame(columns=satellite_features, index=md_srf.index)

def find_satellite_file(directory, pattern):
    """Return the path of the first entry of ``directory`` matching ``pattern``.

    Parameters
    ----------
    directory : str
        Directory whose entries are scanned (non-recursive, ``os.listdir``).
    pattern : str
        Regular expression applied with ``re.match``, i.e. anchored at the
        start of the entry name only.

    Returns
    -------
    str or None
        ``os.path.join(directory, name)`` for the alphabetically first
        matching name, or ``None`` when no entry matches.
    """
    regex = re.compile(pattern)
    # os.listdir() yields names in arbitrary order; sorting makes the result
    # reproducible when more than one entry matches the pattern.
    for name in sorted(os.listdir(directory)):
        if regex.match(name):
            return os.path.join(directory, name)
    return None

def select_nearest_valid(ds, feature, latitude, longitude):
    """Return the value of ``feature`` at the non-NaN grid cell nearest to a point.

    Parameters
    ----------
    ds : dataset (e.g. ``xarray.Dataset``)
        Must provide ``ds[feature]`` on ``lat``/``lon`` dimensions together
        with the ``ds['lat']`` and ``ds['lon']`` coordinate vectors.
    feature : str
        Name of the variable to sample.
    latitude, longitude : float
        Query position, in the units of the ``lat``/``lon`` coordinates.

    Returns
    -------
    numpy.ndarray (0-d) or float
        The value of the nearest valid cell, or ``np.nan`` when every cell
        is NaN.

    Raises
    ------
    KeyError
        Propagated from the ``ds[feature]`` lookup.
    """
    variable = ds[feature]

    # Fast path: the cell selected by the nearest-label lookup is already valid.
    nearest = variable.sel(lat=latitude, lon=longitude, method='nearest').values
    if not np.isnan(nearest):
        return nearest

    # Fallback: read the grid once and search only among the valid cells,
    # rather than sorting every cell by distance and probing them one by one.
    grid = variable.transpose('lat', 'lon').values
    valid = ~np.isnan(grid)
    if not valid.any():
        return np.nan

    # NOTE(review): distance is planar in coordinate units - no wrap-around at
    # the +/-180 longitude edge and no latitude scaling, as in the original.
    lat_offsets = ds['lat'].values.astype(float) - latitude
    lon_offsets = ds['lon'].values.astype(float) - longitude
    # Squared distances are enough to rank cells; broadcasting avoids meshgrid.
    squared = lat_offsets[:, np.newaxis] ** 2 + lon_offsets[np.newaxis, :] ** 2
    squared[~valid] = np.inf

    lat_idx, lon_idx = np.unravel_index(np.argmin(squared), squared.shape)
    return np.asarray(grid[lat_idx, lon_idx])

start_time = time.time()  # wall-clock start of the extraction

resolution = '9km'  # same for every file, so set once outside the loops

for index, row in md_srf.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']
    date = row['Event.date.YYYYMM']

    # Locate the TERRA and AQUA file of each feature for this sample's YYYYMM.
    for feature in satellite_features:
        # re.escape keeps the '.' inside the feature name literal instead of
        # letting it match any character.
        feature_regex = re.escape(feature)

        pattern_terra = rf"TERRA_MODIS\.{date}01_{date}\d{{2}}\.L3m\.MO\.{feature_regex}\.{resolution}\.nc"
        file_path_terra = find_satellite_file(sat_data_path, pattern_terra)
        pattern_aqua = rf"AQUA_MODIS\.{date}01_{date}\d{{2}}\.L3m\.MO\.{feature_regex}\.{resolution}\.nc"
        file_path_aqua = find_satellite_file(sat_data_path, pattern_aqua)

        if file_path_terra and file_path_aqua:
            # Two datasets are opened per sample and feature; the context
            # managers close them as soon as the values have been read.
            with xr.open_dataset(file_path_terra) as ds_terra, \
                    xr.open_dataset(file_path_aqua) as ds_aqua:
                try:
                    # 'CHL.chlor_a' -> 'chlor_a': the part after the dot is
                    # used as the variable name inside the dataset.
                    variable_name = feature.split('.')[1]

                    # Nearest non-NaN value to the sample's latitude/longitude.
                    data_point_terra = select_nearest_valid(ds_terra, variable_name, latitude, longitude)
                    satellite_data_terra.at[index, feature] = data_point_terra

                    data_point_aqua = select_nearest_valid(ds_aqua, variable_name, latitude, longitude)
                    satellite_data_aqua.at[index, feature] = data_point_aqua

                except KeyError:
                    print(f"Feature {feature} not found in dataset")
        else:
            if not file_path_terra:
                print(f"No TERRA file found for pattern: {pattern_terra}")
            if not file_path_aqua:
                print(f"No AQUA file found for pattern: {pattern_aqua}")

# Element-wise mean of TERRA and AQUA; a cell is NaN if either input is NaN.
satellite_data_avg = (satellite_data_terra.astype(float) + satellite_data_aqua.astype(float)) / 2

end_time = time.time()
execution_time = end_time - start_time  # elapsed seconds

print("satellite nan values")
print(satellite_data_avg.isna().sum())

print(f"Tiempo de ejecución: {execution_time:.2f} segundos")

satellite_data_avg

satellite nan values
CHL.chlor_a    0
FLH.nflh       0
KD.Kd_490      0
PAR.par        0
PIC.pic        0
POC.poc        0
RRS.Rrs_412    0
RRS.Rrs_443    0
RRS.Rrs_469    0
RRS.Rrs_488    0
RRS.Rrs_531    0
RRS.Rrs_547    0
RRS.Rrs_555    0
RRS.Rrs_645    0
RRS.Rrs_667    0
RRS.Rrs_678    0
SST.sst        0
dtype: int64
Tiempo de ejecución: 414.64 segundos


Unnamed: 0_level_0,CHL.chlor_a,FLH.nflh,KD.Kd_490,PAR.par,PIC.pic,POC.poc,RRS.Rrs_412,RRS.Rrs_443,RRS.Rrs_469,RRS.Rrs_488,RRS.Rrs_531,RRS.Rrs_547,RRS.Rrs_555,RRS.Rrs_645,RRS.Rrs_667,RRS.Rrs_678,SST.sst
TSC_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
TSC001,0.320456,0.072980,0.0514,43.424000,0.000041,83.399902,0.004447,0.004494,0.004600,0.004193,0.002313,0.001832,0.001606,0.000090,0.000107,0.000140,22.905000
TSC003,0.138729,0.050837,0.0330,39.139997,0.000011,43.199951,0.007104,0.006388,0.005885,0.004918,0.002085,0.001565,0.001351,0.000129,0.000153,0.000172,25.375000
TSC005,0.078143,0.047820,0.0279,40.142998,0.000017,34.299805,0.010119,0.008799,0.007673,0.006127,0.002323,0.001727,0.001472,0.000140,0.000173,0.000187,26.132499
TSC008,0.167823,0.056745,0.0301,24.141998,0.000011,42.699951,0.005551,0.005221,0.005035,0.004234,0.001719,0.001267,0.001112,0.000004,0.000059,0.000090,19.942499
TSC013,0.292031,0.074510,0.0489,16.330997,0.000010,78.599854,0.003375,0.003373,0.003564,0.003343,0.001850,0.001439,0.001303,0.000048,0.000154,0.000186,16.544999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSC276,0.500157,0.088323,0.0707,7.869997,0.000349,118.000000,0.005146,0.004813,0.004952,0.004923,0.003461,0.002879,0.002622,0.000250,0.000449,0.000459,0.280000
TSC280,0.167320,0.092020,0.0584,9.130997,0.000528,70.599854,0.011098,0.008550,0.007384,0.006434,0.004046,0.003200,0.002854,0.000332,0.000547,0.000549,-0.065000
TSC281,0.574612,0.082415,0.0771,12.114998,0.000211,140.899902,0.004376,0.003948,0.004175,0.004273,0.003180,0.002669,0.002387,0.000227,0.000376,0.000390,2.592500
TSC282,0.797660,0.284248,0.0974,35.974998,0.000120,144.099854,0.004052,0.003396,0.003284,0.003242,0.002739,0.002456,0.002253,0.000304,0.000335,0.000467,9.862500


# Test: mean of the n nearest valid pixels

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import os
import netCDF4 as nc
import xarray as xr
import re
from glob import glob
import time

# Input locations.
file_path = '../../01_data/01_biological_data'
file_name = 'metadata.tsv'
sat_data_path = '../../01_data/00_satellite_data'

# Sample metadata: tab-separated, first column used as the index.
file = os.path.join(file_path, file_name)
md = pd.read_csv(file, sep='\t', index_col=0)

# Rows whose Layer is 'SRF', copied so the derived columns do not touch `md`.
md_srf = md.loc[md['Layer'] == 'SRF'].copy()
# 'YYYY-MM...' -> 'YYYYMM', plus a variant with '01' appended.
md_srf['Event.date.YYYYMM'] = md_srf['Event.date'].str[:7].str.replace('-', '')
md_srf['Event.date.YYYYMM01'] = md_srf['Event.date.YYYYMM'] + '01'

# '<PRODUCT>.<variable>' identifiers of the satellite products to extract.
satellite_features = [
    'CHL.chlor_a',
    'FLH.nflh',
    'KD.Kd_490',
    'PAR.par',
    'PIC.pic',
    'POC.poc',
    'RRS.Rrs_412', 'RRS.Rrs_443', 'RRS.Rrs_469', 'RRS.Rrs_488',
    'RRS.Rrs_531', 'RRS.Rrs_547', 'RRS.Rrs_555', 'RRS.Rrs_645',
    'RRS.Rrs_667', 'RRS.Rrs_678',
    'SST.sst',
]

# Empty (samples x features) result tables, one per satellite.
satellite_data_terra = pd.DataFrame(columns=satellite_features, index=md_srf.index)
satellite_data_aqua = pd.DataFrame(columns=satellite_features, index=md_srf.index)

def find_satellite_file(directory, pattern):
    """Return the path of the first entry of ``directory`` matching ``pattern``.

    Parameters
    ----------
    directory : str
        Directory whose entries are scanned (non-recursive, ``os.listdir``).
    pattern : str
        Regular expression applied with ``re.match``, i.e. anchored at the
        start of the entry name only.

    Returns
    -------
    str or None
        ``os.path.join(directory, name)`` for the alphabetically first
        matching name, or ``None`` when no entry matches.
    """
    regex = re.compile(pattern)
    # os.listdir() yields names in arbitrary order; sorting makes the result
    # reproducible when more than one entry matches the pattern.
    for name in sorted(os.listdir(directory)):
        if regex.match(name):
            return os.path.join(directory, name)
    return None

def select_n_nearest_valid(ds, feature, latitude, longitude, n=5, window=2):
    """Average the ``n`` non-NaN grid cells closest to a point.

    Only cells whose ``lat``/``lon`` coordinates lie within ``window`` of the
    requested position (a square box, bounds inclusive) are considered.

    Parameters
    ----------
    ds : dataset (e.g. ``xarray.Dataset``)
        Must provide ``ds[feature]`` on ``lat``/``lon`` dimensions together
        with the ``ds['lat']`` and ``ds['lon']`` coordinate vectors.
    feature : str
        Name of the variable to sample.
    latitude, longitude : float
        Query position, in the units of the ``lat``/``lon`` coordinates.
    n : int, default 5
        Maximum number of valid cells to average; values below 1 are treated
        as 1.
    window : float, default 2
        Half-width of the search box, in coordinate units.

    Returns
    -------
    float
        Mean of up to ``n`` nearest valid values, or ``np.nan`` when the box
        contains no grid cell or no valid value.

    Raises
    ------
    KeyError
        Propagated from the ``ds[feature]`` lookup.
    """
    latitudes = ds['lat'].values
    longitudes = ds['lon'].values

    # Positions *in the full grid* of the coordinates inside the search box.
    # Indexing the dataset with positions relative to the sub-grid would read
    # cells from the corner of the full grid instead of from the box.
    lat_positions = np.flatnonzero(
        (latitudes >= latitude - window) & (latitudes <= latitude + window)
    )
    lon_positions = np.flatnonzero(
        (longitudes >= longitude - window) & (longitudes <= longitude + window)
    )
    if lat_positions.size == 0 or lon_positions.size == 0:
        return np.nan

    # Read the whole box in one call (array indexers select the lat x lon
    # outer product) instead of one dataset access per candidate cell.
    box = ds[feature].isel(lat=lat_positions, lon=lon_positions).transpose('lat', 'lon').values
    valid = ~np.isnan(box)
    if not valid.any():
        return np.nan

    # NOTE(review): planar distance in coordinate units - no wrap-around at
    # the +/-180 longitude edge and no latitude scaling, as in the original.
    lat_offsets = latitudes[lat_positions].astype(float) - latitude
    lon_offsets = longitudes[lon_positions].astype(float) - longitude
    distances = np.sqrt(lat_offsets[:, np.newaxis] ** 2 + lon_offsets[np.newaxis, :] ** 2)

    # Rank only the valid cells, nearest first, and average the first n.
    nearest_first = np.argsort(distances[valid], kind='stable')
    wanted = max(int(n), 1)
    return np.mean(box[valid][nearest_first[:wanted]])

start_time = time.time()  # wall-clock start of the extraction

resolution = '9km'  # same for every file, so set once outside the loops

for index, row in md_srf.iterrows():
    latitude = row['Latitude']
    longitude = row['Longitude']
    date = row['Event.date.YYYYMM']

    # Locate the TERRA and AQUA file of each feature for this sample's YYYYMM.
    for feature in satellite_features:
        # re.escape keeps the '.' inside the feature name literal instead of
        # letting it match any character.
        feature_regex = re.escape(feature)

        pattern_terra = rf"TERRA_MODIS\.{date}01_{date}\d{{2}}\.L3m\.MO\.{feature_regex}\.{resolution}\.nc"
        file_path_terra = find_satellite_file(sat_data_path, pattern_terra)
        pattern_aqua = rf"AQUA_MODIS\.{date}01_{date}\d{{2}}\.L3m\.MO\.{feature_regex}\.{resolution}\.nc"
        file_path_aqua = find_satellite_file(sat_data_path, pattern_aqua)

        if file_path_terra and file_path_aqua:
            # Two datasets are opened per sample and feature; the context
            # managers close them as soon as the values have been read.
            with xr.open_dataset(file_path_terra) as ds_terra, \
                    xr.open_dataset(file_path_aqua) as ds_aqua:
                try:
                    # 'CHL.chlor_a' -> 'chlor_a': the part after the dot is
                    # used as the variable name inside the dataset.
                    variable_name = feature.split('.')[1]

                    # n=1: take the single nearest valid cell in the search box.
                    data_point_terra = select_n_nearest_valid(ds_terra, variable_name, latitude, longitude, n=1)
                    satellite_data_terra.at[index, feature] = data_point_terra

                    data_point_aqua = select_n_nearest_valid(ds_aqua, variable_name, latitude, longitude, n=1)
                    satellite_data_aqua.at[index, feature] = data_point_aqua

                except KeyError:
                    print(f"Feature {feature} not found in dataset")
        else:
            if not file_path_terra:
                print(f"No TERRA file found for pattern: {pattern_terra}")
            if not file_path_aqua:
                print(f"No AQUA file found for pattern: {pattern_aqua}")

# Element-wise mean of TERRA and AQUA; a cell is NaN if either input is NaN.
satellite_data_avg = (satellite_data_terra.astype(float) + satellite_data_aqua.astype(float)) / 2

end_time = time.time()
execution_time = end_time - start_time  # elapsed seconds

print("satellite nan values")
print(satellite_data_avg.isna().sum())

print(f"Tiempo de ejecución: {execution_time:.2f} segundos")

satellite_data_avg

In [27]:
# Open one TERRA CHL.chlor_a file by its exact name so its coordinate grid
# can be inspected in the next cell.
# The former `find_satellite_file(sat_data_path, pattern_terra)` call was
# dropped: it depended on `pattern_terra` left over from the extraction loop
# and its result was never used here.
filename = 'TERRA_MODIS.20130601_20130630.L3m.MO.CHL.chlor_a.9km.nc'
sat_file = os.path.join(sat_data_path, filename)

# Left open on purpose: `ds` is read by the following cell.
ds = xr.open_dataset(sat_file)

In [33]:
# Coordinate vectors of the dataset opened in the previous cell.
latitudes, longitudes = (ds[axis].values for axis in ('lat', 'lon'))

# Displayed as the cell output: two 2-D arrays (lat grid, lon grid) with
# matrix-style 'ij' indexing, i.e. rows follow latitude, columns longitude.
np.meshgrid(latitudes, longitudes, indexing='ij')

[array([[ 89.958336,  89.958336,  89.958336, ...,  89.958336,  89.958336,
          89.958336],
        [ 89.875   ,  89.875   ,  89.875   , ...,  89.875   ,  89.875   ,
          89.875   ],
        [ 89.791664,  89.791664,  89.791664, ...,  89.791664,  89.791664,
          89.791664],
        ...,
        [-89.79167 , -89.79167 , -89.79167 , ..., -89.79167 , -89.79167 ,
         -89.79167 ],
        [-89.87501 , -89.87501 , -89.87501 , ..., -89.87501 , -89.87501 ,
         -89.87501 ],
        [-89.958336, -89.958336, -89.958336, ..., -89.958336, -89.958336,
         -89.958336]], dtype=float32),
 array([[-179.95833, -179.875  , -179.79167, ...,  179.79167,  179.87502,
          179.95834],
        [-179.95833, -179.875  , -179.79167, ...,  179.79167,  179.87502,
          179.95834],
        [-179.95833, -179.875  , -179.79167, ...,  179.79167,  179.87502,
          179.95834],
        ...,
        [-179.95833, -179.875  , -179.79167, ...,  179.79167,  179.87502,
          179.95834