In [None]:
import netCDF4 as nc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import pickle
# import cartopy.crs as ccrs
import geopandas as gpd
from datetime import timezone
from dateutil import parser
import matplotlib.colors as mcolors


In [None]:

def print_netcdf_summary(file_path, return_variables=None):
    """Print a summary of a NetCDF file and optionally return variable data.

    The summary lists the global attributes, the dimensions (with size and
    whether they are unlimited) and, for every variable, its dtype,
    dimensions, shape and attributes. If the file has a ``time`` variable,
    its units and values are printed as well.

    Parameters
    ----------
    file_path : str
        Path of the NetCDF file to inspect.
    return_variables : collection of str, optional
        Names of the variables whose full contents (``var[:]``) should be
        returned.

    Returns
    -------
    dict or None
        ``{variable_name: data}`` for the requested variables found in the
        file when ``return_variables`` is truthy; ``None`` otherwise, and
        also ``None`` when the file is missing or cannot be opened.
    """
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' does not exist.")
        return

    try:
        dataset = nc.Dataset(file_path, mode='r')
    except Exception as e:
        print(f"Error opening file: {e}")
        return

    variable_data = {}

    # The context manager guarantees the file handle is released even if
    # reading an attribute or a variable raises part-way through.
    with dataset:
        print("Global Attributes:")
        for attr in dataset.ncattrs():
            print(f"  {attr}: {dataset.getncattr(attr)}")

        print("\nDimensions:")
        for dim, dim_obj in dataset.dimensions.items():
            print(f"  {dim}: length {dim_obj.size} (unlimited: {dim_obj.isunlimited()})")

        print("\nVariables:")
        for var, var_obj in dataset.variables.items():
            print(f"Variable: {var}")
            print(f"  Type: {var_obj.dtype}")
            print(f"  Dimensions: {var_obj.dimensions}")
            print(f"  Shape: {var_obj.shape}")
            for attr in var_obj.ncattrs():
                print(f"    {attr}: {var_obj.getncattr(attr)}")
            if return_variables and var in return_variables:
                # Data must be read while the file is still open
                variable_data[var] = var_obj[:]

        if 'time' in dataset.variables:
            T_var = dataset.variables['time']
            T_units = T_var.units if 'units' in T_var.ncattrs() else 'No units available'
            T_data = T_var[:]
            print(f"\nT units: {T_units} Data: {T_data}")

    if return_variables:
        return variable_data


In [None]:

def read_station(folder_path, file_name):
    """Load a station observation table from ``<folder_path>/<file_name>.txt``.

    The file is read as a whitespace-delimited table; the values ``-99999``
    and ``-1`` are treated as missing, and the ``date`` column is converted
    to datetimes.

    Parameters
    ----------
    folder_path : str
        Folder that contains the station text files.
    file_name : str or int
        File name without the ``.txt`` extension (it is formatted into the
        path, so any value with a string representation works).

    Returns
    -------
    (pandas.DataFrame, int)
        The observations and a flag: ``0`` when the file was read, ``1``
        (with an empty DataFrame) when the file does not exist.
    """
    full_path = os.path.join(folder_path, f'{file_name}.txt')
    try:
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True; -99999 and -1 are the missing-value codes.
        df = pd.read_table(full_path, sep=r'\s+', na_values=[-99999, -1])
    except FileNotFoundError:
        print(f"No data: {full_path}")
        # Empty DataFrame + flag 1 tells the caller to skip this station
        return pd.DataFrame(), 1

    df['date'] = pd.to_datetime(df['date'])
    return df, 0


In [None]:


def list_files_and_dates(directory_path):
    """List the forecast sub-directories of ``directory_path`` with their dates.

    Every sub-directory whose name parses with the ``%Y%m%d%H%M`` format is
    kept; other sub-directories (for example ``.ipynb_checkpoints``) are
    reported and skipped instead of aborting the whole listing.

    Parameters
    ----------
    directory_path : str
        Folder that contains one sub-directory per forecast.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'File Name'`` (sub-directory name) and ``'Date'`` (parsed
        timestamp), sorted by date with a fresh 0..n-1 index. ``None`` when
        ``directory_path`` does not exist.
    """
    if not os.path.exists(directory_path):
        print(f"The directory '{directory_path}' does not exist.")
        return None

    forecasts = []
    file_dates = []
    for entry in os.listdir(directory_path):
        # Only directories are forecast folders; plain files are ignored
        if not os.path.isdir(os.path.join(directory_path, entry)):
            continue
        # NOTE(review): the process_nasa_* functions parse the same folder
        # names with '%Y%m%d%H'; confirm which format the names really use.
        parsed = pd.to_datetime(entry, format='%Y%m%d%H%M', errors='coerce')
        if pd.isna(parsed):
            print(f"Skipping '{entry}': name is not a forecast timestamp.")
            continue
        forecasts.append(entry)
        file_dates.append(parsed)

    file_data = pd.DataFrame({
        'File Name': forecasts,
        'Date': file_dates
    }).sort_values(by='Date').reset_index(drop=True)
    return file_data
    
    

In [None]:
def process_nasa_rapid_MOGREPS(directory_path, file_data, river_id):
    """Build daily-mean streamflow series for one river from RAPID forecasts.

    For every forecast folder listed in ``file_data['File Name']`` the
    ``.nc`` files are read in date order, the ``Qout`` value of ``river_id``
    is extracted from each, and the values are averaged per day. Daily
    timestamps are shifted to 12:00.

    Parameters
    ----------
    directory_path : str
        Folder that contains the forecast sub-directories.
    file_data : pandas.DataFrame
        Must have a ``'File Name'`` column with folder names in
        ``%Y%m%d%H`` format.
    river_id : int
        Value looked up in the ``rivid`` variable of each file.

    Returns
    -------
    dict
        ``{issue_date (UTC): {'time', 'dis24_station', 'start_date'}}``.
        Forecasts whose folder is missing or that yield no value are
        omitted.
    """
    forecast_dict = {}  # Use a dictionary to store the data by issue_date

    for forecast in file_data['File Name']:
        issue_date = pd.to_datetime(forecast, format='%Y%m%d%H').replace(tzinfo=timezone.utc)
        full_path = os.path.join(directory_path, forecast)

        if not os.path.exists(full_path):
            print("The specified path does not exist.")
            continue

        nc_files = [f for f in os.listdir(full_path) if f.endswith('.nc')]
        # The file date is the third '_'-separated token of the file name
        file_dates = [parser.parse(f.split('_')[2].split('.')[0]).replace(tzinfo=timezone.utc) for f in nc_files]

        file_nc_data = pd.DataFrame({
            'File Name': nc_files,
            'Date': file_dates
        }).sort_values(by='Date').reset_index(drop=True)

        # Dates are collected together with the values so that a file that
        # fails to read (or lacks the river) cannot desynchronise the index.
        valid_dates = []
        streamflows = []
        for filename, date_nc in file_nc_data.itertuples(index=False):
            full_path_file = os.path.join(full_path, filename)
            try:
                with nc.Dataset(full_path_file, mode='r') as dataset:
                    river_ids = dataset.variables['rivid'][:]
                    river_index = np.where(river_ids == river_id)[0]
                    if river_index.size == 0:
                        print(f"River ID {river_id} not found in {filename}.")
                        continue
                    # NOTE(review): indexes the first axis of Qout with the
                    # river position and takes element [0, 0] - confirm the
                    # axis order of Qout in these files.
                    qout_data = dataset.variables['Qout'][river_index, :][0, 0]
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
            valid_dates.append(date_nc)
            streamflows.append(qout_data)

        if not streamflows:
            continue

        df_for = pd.DataFrame(
            {'dis24_station': streamflows},
            index=pd.DatetimeIndex(valid_dates, name='Date'),
        )
        df_for_daily = df_for.resample('D').mean()
        # Stamp each daily mean at midday instead of midnight
        df_for_daily.index = df_for_daily.index + pd.Timedelta(hours=12)

        forecast_dict[issue_date] = {
            'time': df_for_daily.index,
            'dis24_station': df_for_daily['dis24_station'],
            'start_date': df_for_daily.index.min()
        }

    return forecast_dict

In [None]:
def process_nasa_hymap_MOGREPS(directory_path, file_data, n_s, e_w):
    """Extract ensemble streamflow at one grid cell from HyMAP MOGREPS forecasts.

    For every forecast folder listed in ``file_data['File Name']`` only the
    ``.nc`` files dated at 12:00 are used. From each file the
    ``Streamflow_inst[:, n_s, e_w]`` slice is read, giving one row per file.

    Parameters
    ----------
    directory_path : str
        Folder that contains the forecast sub-directories.
    file_data : pandas.DataFrame
        Must have a ``'File Name'`` column with folder names in
        ``%Y%m%d%H`` format.
    n_s, e_w : int
        Indices used on the second and third axes of ``Streamflow_inst``
        (presumably north-south / east-west grid indices - confirm against
        the file dimensions).

    Returns
    -------
    dict
        ``{issue_date (UTC): {'time', 'dis24_ensem', 'dis24_station',
        'start_date'}}`` where ``dis24_ensem`` has one row per successfully
        read file and ``dis24_station`` is the mean across each row.
        Forecasts with a missing folder or no readable 12:00 file are
        omitted.
    """
    forecast_dict = {}  # Use a dictionary to store the data by issue_date

    for forecast in file_data['File Name']:
        issue_date = pd.to_datetime(forecast, format='%Y%m%d%H').replace(tzinfo=timezone.utc)
        full_path = os.path.join(directory_path, forecast)

        if not os.path.exists(full_path):
            print("The specified path does not exist.")
            continue

        nc_files = [f for f in os.listdir(full_path) if f.endswith('.nc')]
        # The file date is the third '_'-separated token of the file name
        dated_files = [
            (parser.parse(f.split('_')[2].split('.')[0]).replace(tzinfo=timezone.utc), f)
            for f in nc_files
        ]
        dated_files.sort(key=lambda item: item[0])
        # Keep only the files whose timestamp is at 12:00
        dated_files = [(date, name) for date, name in dated_files if date.hour == 12]

        # Dates are stored next to the data so a file that fails to read
        # cannot shift the time axis against the streamflow rows.
        valid_dates = []
        streamflow_data = []
        for date_nc, filename in dated_files:
            full_path_file = os.path.join(full_path, filename)
            try:
                with nc.Dataset(full_path_file, mode='r') as dataset:
                    # All members along the first axis for the requested cell
                    stream = dataset.variables['Streamflow_inst'][:, n_s, e_w]
                    row = list(stream)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
            valid_dates.append(date_nc)
            streamflow_data.append(row)

        if not streamflow_data:
            # Nothing usable: the row-wise mean below needs a 2-D array
            print(f"No 12:00 data could be read for forecast {forecast}.")
            continue

        # Convert list to NumPy array for matrix operations
        streamflow_data_np = np.array(streamflow_data)
        dis24_mean = streamflow_data_np.mean(axis=1)
        time_index = pd.DatetimeIndex(valid_dates, name='Date')

        forecast_dict[issue_date] = {
            'time': time_index,
            'dis24_ensem': streamflow_data_np,
            'dis24_station': dis24_mean,
            'start_date': time_index.min()
        }

    return forecast_dict

In [None]:
def process_nasa_hymap_GALWEM(directory_path, file_data, n_s, e_w):
    """Extract streamflow at one grid cell from HyMAP GALWEM forecasts.

    For every forecast folder listed in ``file_data['File Name']`` only the
    ``.nc`` files dated at 12:00 are used. From each file the single value
    ``Streamflow_inst[n_s, e_w]`` is read.

    Parameters
    ----------
    directory_path : str
        Folder that contains the forecast sub-directories.
    file_data : pandas.DataFrame
        Must have a ``'File Name'`` column with folder names in
        ``%Y%m%d%H`` format.
    n_s, e_w : int
        Indices used on the two axes of ``Streamflow_inst`` (presumably
        north-south / east-west grid indices - confirm against the file
        dimensions).

    Returns
    -------
    dict
        ``{issue_date (UTC): {'time', 'dis24_station', 'start_date'}}`` with
        one ``dis24_station`` value per successfully read file. Forecasts
        with a missing folder or no readable 12:00 file are omitted.
    """
    forecast_dict = {}  # Use a dictionary to store the data by issue_date

    for forecast in file_data['File Name']:
        issue_date = pd.to_datetime(forecast, format='%Y%m%d%H').replace(tzinfo=timezone.utc)
        full_path = os.path.join(directory_path, forecast)

        if not os.path.exists(full_path):
            print("The specified path does not exist.")
            continue

        nc_files = [f for f in os.listdir(full_path) if f.endswith('.nc')]
        # The file date is the third '_'-separated token of the file name
        dated_files = [
            (parser.parse(f.split('_')[2].split('.')[0]).replace(tzinfo=timezone.utc), f)
            for f in nc_files
        ]
        dated_files.sort(key=lambda item: item[0])
        # Keep only the files whose timestamp is at 12:00
        dated_files = [(date, name) for date, name in dated_files if date.hour == 12]

        # Dates are stored next to the data so a file that fails to read
        # cannot shift the time axis against the streamflow values.
        valid_dates = []
        streamflow_data = []
        for date_nc, filename in dated_files:
            full_path_file = os.path.join(full_path, filename)
            try:
                with nc.Dataset(full_path_file, mode='r') as dataset:
                    # Single value for the requested grid cell
                    stream = dataset.variables['Streamflow_inst'][n_s, e_w]
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
            valid_dates.append(date_nc)
            streamflow_data.append(stream)

        if not streamflow_data:
            print(f"No 12:00 data could be read for forecast {forecast}.")
            continue

        # Convert list to NumPy array for matrix operations
        streamflow_data_np = np.array(streamflow_data)
        time_index = pd.DatetimeIndex(valid_dates, name='Date')

        forecast_dict[issue_date] = {
            'time': time_index,
            'dis24_station': streamflow_data_np,
            'start_date': time_index.min()
        }

    return forecast_dict

In [None]:
# See files from system: print the NetCDF summary of the first .nc file found
# in the earliest forecast folder of the selected system.
# NOTE: `directory_path` set here is also read by the per-station processing
# cell further down, so this cell must run first.
directory_path = 'GHI-MR-MOGREPS-G-hymap-ROUTING/'

file_data = list_files_and_dates(directory_path)

first_nc_file = None
if file_data is None or file_data.empty:
    # list_files_and_dates returns None for a missing directory and an empty
    # table when there are no forecast folders; neither can be indexed.
    print(f"No forecast folders found in '{directory_path}'.")
else:
    folder_issue = file_data['File Name'].iloc[0]
    full_path = os.path.join(directory_path, folder_issue)

    if os.path.exists(full_path):
        # First entry ending in .nc, or None when the folder has none
        first_nc_file = next(
            (entry for entry in os.listdir(full_path) if entry.endswith('.nc')),
            None,
        )
    else:
        print("The specified path does not exist.")

if first_nc_file is None:
    print("No .nc file available to summarize.")
else:
    file_path = os.path.join(full_path, first_nc_file)
    print_netcdf_summary(file_path)


# n_s = 640
# e_w = 907

# # Path to shapefile
# shapefile_path = '../Documents/gauge_stations_shapefile.gpkg'

# # Calculate longitude and latitude arrays
# # Entered manually based on the def print_netcdf_summary output
# sw_corner_lon = -179.9296875  # South-West corner longitude
# sw_corner_lat = -89.953125    # South-West corner latitude
# dx = 0.140625  # Longitude increment
# dy = 0.09375   # Latitude increment

# # Open the NetCDF file
# nc_data = nc.Dataset(file_path, 'r')

# # Generate longitude and latitude arrays
# longitudes = sw_corner_lon + np.arange(nc_data.dimensions['east_west'].size) * dx
# latitudes = sw_corner_lat + np.arange(nc_data.dimensions['north_south'].size) * dy

# # Define the latitude and longitude slices for Rio Grande do Sul
# lat_bounds = [-32, -26]
# lon_bounds = [-57, -49]

# # Find the indices for slicing
# lat_indices = np.where((latitudes >= lat_bounds[0]) & (latitudes <= lat_bounds[1]))[0]
# lon_indices = np.where((longitudes >= lon_bounds[0]) & (longitudes <= lon_bounds[1]))[0]

# # Read the streamflow data and slice it
# streamflow = nc_data.variables['Streamflow_tavg'][lat_indices, lon_indices]


# # Load the shapefile using Geopandas
# gdf = gpd.read_file(shapefile_path)

# # Create a plot
# fig, ax = plt.subplots(figsize=(12, 10), subplot_kw={'projection': ccrs.PlateCarree()})
# ax.set_extent([lon_bounds[0], lon_bounds[1], lat_bounds[0], lat_bounds[1]], crs=ccrs.PlateCarree())

# # Set colobar limits to better visualization
# bar_lim_sup = 2000
# bar_lim_inf = 0

# # Plot the NetCDF data
# cf = ax.pcolormesh(longitudes[lon_indices], latitudes[lat_indices], streamflow, shading='auto', cmap='plasma', vmin=bar_lim_inf, vmax=bar_lim_sup, transform=ccrs.PlateCarree())

# # Add the shapefile
# gdf.plot(ax=ax, facecolor='black', edgecolor='white', linewidth=1, marker = 'o', transform=ccrs.PlateCarree())

# # Adding features
# ax.coastlines(resolution='10m', color='black', linewidth=1)
# # ax.set_title(f"Ensemble mean {file_data['Date'].loc[file_n]} - Forecast issued in: {issue_date}")
# plt.colorbar(cf, ax=ax, label='Streamflow (m³/s)')
# plt.grid()
# plt.show()

# # Close the NetCDF file
# nc_data.close()


In [None]:
# # See files from system
# directory_path = 'GHI-MR-GALWEM-D-rapid-ROUTING/'

# file_data = list_files_and_dates(directory_path)

# folder_issue = file_data['File Name'].loc[0]
# full_path = os.path.join(directory_path, folder_issue)

# if os.path.exists(full_path):
#     for entry in os.listdir(full_path):
#         if entry.endswith('.nc'):  # Checking if the file ends with .nc
#             first_nc_file = entry  # Store the file name
#             break  # Exit the loop after the first .nc file is found
# else:
#     print("The specified path does not exist.")

# file_path = os.path.join(full_path,first_nc_file)
# print_netcdf_summary(file_path)


In [None]:
# # See files from system
# directory_path = 'GHI-MR-MOGREPS-G-hymap-ROUTING/'

# file_data = list_files_and_dates(directory_path)


# folder_issue = file_data['File Name'].loc[0]
# full_path = os.path.join(directory_path, folder_issue)

# if os.path.exists(full_path):
#     for entry in os.listdir(full_path):
#         if entry.endswith('.nc'):  # Checking if the file ends with .nc
#             first_nc_file = entry  # Store the file name
#             break  # Exit the loop after the first .nc file is found
# else:
#     print("The specified path does not exist.")

# file_path = os.path.join(full_path,first_nc_file)
# print_netcdf_summary(file_path)





In [None]:
# # See files from system
# directory_path = 'GHI-MR-MOGREPS-G-rapid-ROUTING/'

# file_data = list_files_and_dates(directory_path)


# folder_issue = file_data['File Name'].loc[0]
# full_path = os.path.join(directory_path, folder_issue)

# if os.path.exists(full_path):
#     for entry in os.listdir(full_path):
#         if entry.endswith('.nc'):  # Checking if the file ends with .nc
#             first_nc_file = entry  # Store the file name
#             break  # Exit the loop after the first .nc file is found
# else:
#     print("The specified path does not exist.")

# file_path = os.path.join(full_path,first_nc_file)
# print_netcdf_summary(file_path)


In [None]:

name_data = 'discharge'

# -----------------------------------------------------------------------------
# Read data from station
info_stations = pd.read_csv('../Documents/porto_alegre_stations_ids.csv')

# Specify the directory paths
hist_folder_path = f'Historic_{name_data}'
telem_folder_path = 'Telemetricas'

# Access the first row directly for debugging
first_row = info_stations.iloc[0]

# `directory_path` comes from the inspection cell above; it selects which
# forecast system is processed and therefore which reader is used.
rapid_directories = (
    'GHI-MR-MOGREPS-G-rapid-ROUTING/',
    'GHI-MR-GALWEM-D-rapid-ROUTING/',
)
# TODO: CHECK BOTH HYMAP
hymap_processors = {
    'GHI-MR-MOGREPS-G-hymap-ROUTING/': process_nasa_hymap_MOGREPS,
    'GHI-MR-GALWEM-D-hymap-ROUTING/': process_nasa_hymap_GALWEM,
}

# Fail early: without a matching reader `processed_data` would be undefined
# (or left over from a previous station) when it is pickled below.
if directory_path not in rapid_directories and directory_path not in hymap_processors:
    raise ValueError(f"No processing function for directory '{directory_path}'.")

# The forecast listing does not depend on the station, so build it once
file_data = list_files_and_dates(directory_path)
if file_data is None:
    raise FileNotFoundError(f"The directory '{directory_path}' does not exist.")

# Loop through each station using the 'Code' column
for index, row in info_stations.iterrows():

    print(index)

    station_name = row['Name']
    station_code = row['Code']
    station_nasa_code = row['RAPID (rivid)']
    n_s = row['LIS_Grid(lat)']
    print(n_s)
    e_w = row['LIS_Grid(lon)']
    print(e_w)
    print('----------------------------------------------------------------------------')
    print(station_code)
    print(station_name)
    print(station_nasa_code)

    df_telem, bin_data = read_station('../Telemetry', station_code)

    # Stations without a telemetry file are skipped entirely
    if bin_data == 1:
        continue

    if directory_path in rapid_directories:
        processed_data = process_nasa_rapid_MOGREPS(directory_path, file_data, station_nasa_code)
    else:
        processed_data = hymap_processors[directory_path](directory_path, file_data, n_s, e_w)

    # Save the processed data to a file
    output_path = os.path.join(directory_path, f'data_nasa_{station_code}.pkl')
    with open(output_path, 'wb') as file:
        pickle.dump(processed_data, file)
