In [1]:
import os
import re
import struct # For reading binary record markers

import numpy as np
import pandas as pd
import xarray as xr

In [2]:
def parse_grads_ctl(ctl_file_path):
    """
    Parses a GrADS .ctl file to extract metadata for dimensions and variables.

    Records are recognised by their first whitespace-separated token
    (case-insensitive). Blank lines and comment lines (starting with '*') are
    ignored everywhere, including inside the VARS ... ENDVARS block.

    Args:
        ctl_file_path (str): The path to the GrADS .ctl file.

    Returns:
        dict: A dictionary containing parsed metadata. Keys are only present
              when the corresponding record exists in the file:

              - 'dset' (str): data file name with any '^' stripped.
              - 'undef' (float): missing-value marker.
              - 'title' (str): title text ('' for a bare TITLE record).
              - 'options' (list[str]): every OPTIONS keyword, lower-cased.
              - 'byte_order' (str): 'little_endian' or 'big_endian', set only
                when one of those keywords appears in an OPTIONS record.
              - 'xdef', 'ydef', 'zdef' (dict): 'count', 'type', 'start' and
                'increment'. For a LEVELS axis the exact values are also
                stored under 'levels'; 'start'/'increment' are then derived
                from the first two levels (increment 1.0 for a single level)
                so callers that only understand linear axes keep working.
              - 'tdef' (dict): 'count', 'type', 'start_str', 'increment_str'.
              - 'num_vars' (int): the count given on the VARS record.
              - 'variables' (list[dict]): one dict per variable record with
                'name', 'description' and 'levs' (the integer level count
                from the record, or None when it is missing/not an integer).

    Raises:
        ValueError: If a numeric field cannot be converted, or a LEVELS axis
            lists no values.
        IndexError: If a record has fewer fields than its keyword requires.
    """
    with open(ctl_file_path, 'r') as f:
        original_lines = f.readlines()

    def _parse_axis(parts):
        """Build the dict for one XDEF/YDEF/ZDEF record from its tokens."""
        axis = {'count': int(parts[1]), 'type': parts[2]}
        if parts[2].lower() == 'levels':
            # The values may continue on following lines; collected by the caller.
            axis['levels'] = [float(tok) for tok in parts[3:]]
        else:
            axis['start'] = float(parts[3])
            axis['increment'] = float(parts[4])
        return axis

    def _parse_variable(parts):
        """Build the dict for one 'name levs units description' record."""
        if '**' in parts:
            # Some files separate the description with a literal '**' token.
            description = " ".join(parts[parts.index('**') + 1:])
        else:
            # parts[1] is the level count and parts[2] the units field, so the
            # free-text description starts at the fourth token.
            description = " ".join(parts[3:])
        try:
            levs = int(parts[1])
        except (IndexError, ValueError):
            levs = None
        return {'name': parts[0], 'description': description.strip(), 'levs': levs}

    metadata = {}
    in_vars_block = False
    open_axis = None  # LEVELS axis that may still receive continuation lines

    for raw_line in original_lines:
        line = raw_line.strip()
        if not line or line.startswith('*'):
            continue
        parts = line.split()
        keyword = parts[0].lower()

        if in_vars_block:
            # Inside VARS ... ENDVARS every line is a variable record, even if
            # its name happens to look like a header keyword.
            if keyword == 'endvars':
                in_vars_block = False
            else:
                metadata['variables'].append(_parse_variable(parts))
            continue

        if open_axis is not None:
            # A purely numeric line right after a LEVELS record continues it;
            # anything else starts a new record.
            try:
                extra_levels = [float(tok) for tok in parts]
            except ValueError:
                open_axis = None
            else:
                open_axis['levels'].extend(extra_levels)
                continue

        if keyword == 'dset':
            metadata['dset'] = parts[1].strip('^')
        elif keyword == 'undef':
            metadata['undef'] = float(parts[1])
        elif keyword == 'title':
            title_fields = line.split(None, 1)
            metadata['title'] = title_fields[1] if len(title_fields) > 1 else ''
        elif keyword == 'options':
            option_names = [tok.lower() for tok in parts[1:]]
            metadata.setdefault('options', []).extend(option_names)
            for option_name in option_names:
                if option_name in ('little_endian', 'big_endian'):
                    metadata['byte_order'] = option_name
        elif keyword in ('xdef', 'ydef', 'zdef'):
            axis = _parse_axis(parts)
            metadata[keyword] = axis
            open_axis = axis if 'levels' in axis else None
        elif keyword == 'tdef':
            metadata['tdef'] = {'count': int(parts[1]), 'type': parts[2],
                                'start_str': parts[3], 'increment_str': parts[4]}
        elif keyword == 'vars':
            metadata['num_vars'] = int(parts[1])
            metadata['variables'] = []
            in_vars_block = True

    # Give LEVELS axes the same 'start'/'increment' keys as LINEAR ones.
    for axis_name in ('xdef', 'ydef', 'zdef'):
        axis = metadata.get(axis_name)
        if axis is None or 'levels' not in axis:
            continue
        levels = axis['levels'][:axis['count']]
        if not levels:
            raise ValueError(f"{axis_name.upper()} record declares LEVELS but lists no values.")
        axis['levels'] = levels
        axis['start'] = levels[0]
        axis['increment'] = levels[1] - levels[0] if len(levels) > 1 else 1.0

    return metadata

In [3]:
# Month abbreviations used in GrADS time strings -> month number.
_GRADS_MONTH_NUMBERS = {
    name: number for number, name in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'), start=1)
}

# GrADS time-increment unit -> pandas.DateOffset keyword.
_GRADS_STEP_KEYWORDS = {'yr': 'years', 'mo': 'months', 'dy': 'days',
                        'hr': 'hours', 'mn': 'minutes', 'sc': 'seconds'}


def _grads_axis_values(axis):
    """Return the coordinate values described by one parsed XDEF/YDEF/ZDEF dict."""
    levels = axis.get('levels')
    if levels is not None and len(levels) == axis['count']:
        return np.asarray(levels, dtype=float)
    # start + k * increment always yields exactly 'count' values, whereas
    # np.arange(start, stop, float_step) can over-shoot by one element.
    return axis['start'] + axis['increment'] * np.arange(axis['count'])


def _grads_start_time(start_str):
    """Convert a GrADS start time ([hh[:mm]Z][dd]mmmyyyy) to a pandas Timestamp."""
    match = re.fullmatch(r'(?:(\d{1,2})(?::(\d{1,2}))?z)?(\d{1,2})?([a-z]{3})(\d{4}|\d{2})',
                         start_str.strip().lower())
    if match and match.group(4) in _GRADS_MONTH_NUMBERS:
        hour, minute, day, month_name, year_text = match.groups()
        year = int(year_text)
        if len(year_text) == 2:
            # GrADS reads two-digit years as 1950-2049.
            year += 1900 if year >= 50 else 2000
        try:
            return pd.Timestamp(year=year, month=_GRADS_MONTH_NUMBERS[month_name],
                                day=int(day or 1), hour=int(hour or 0),
                                minute=int(minute or 0))
        except ValueError:
            pass  # impossible calendar date; fall through to the generic parser
    print(f"Warning: Could not parse start date '{start_str}' as a GrADS time "
          f"([hh[:mm]Z][dd]mmmyyyy). Attempting generic parse.")
    return pd.to_datetime(start_str)


def _grads_time_step(increment_str):
    """Convert a GrADS time increment such as '1mo' or '6hr' to a pandas DateOffset."""
    match = re.fullmatch(r'(\d+)(yr|mo|dy|hr|mn|sc)', increment_str.strip().lower())
    if match and int(match.group(1)) > 0:
        # A DateOffset steps relative to the start date, so a series starting
        # on e.g. the 16th stays on the 16th instead of snapping to month start.
        return pd.DateOffset(**{_GRADS_STEP_KEYWORDS[match.group(2)]: int(match.group(1))})
    print(f"Error: Unable to determine pandas frequency from '{increment_str}'. "
          f"Defaulting to 'D'. This may cause incorrect time coordinates.")
    return pd.DateOffset(days=1)


def convert_grads_to_netcdf(ctl_file_path, output_nc_file, units='kg/m^2/s'):
    """
    Converts a GrADS binary data file (.dat) associated with a .ctl file
    into a NetCDF (.nc) file using xarray. The data is read as a raw binary
    stream of 32-bit floats, potentially with a leading header.

    Within each time step GrADS stores the variables one after another, each
    as a stack of (lat, lon) grids, one per level. The level count of each
    variable is taken from the 'levs' entry of its parsed record when the
    parser provides one (0 means a single level); otherwise every variable is
    assumed to have ZDEF-count levels.

    Problems found while reading or writing are printed (with a traceback for
    unexpected ones) rather than raised.

    Args:
        ctl_file_path (str): The path to the GrADS .ctl control file.
        output_nc_file (str): The desired path for the output NetCDF file.
        units (str | None): Value of the 'units' attribute written on every
            variable. Pass None to omit the attribute. Defaults to
            'kg/m^2/s'.
    """
    metadata = parse_grads_ctl(ctl_file_path)

    # The dset entry is resolved relative to the directory of the .ctl file.
    data_file_path = os.path.join(os.path.dirname(ctl_file_path), metadata['dset'])

    # 1. Infer spatial dimensions and coordinates
    x_coords = _grads_axis_values(metadata['xdef'])
    y_coords = _grads_axis_values(metadata['ydef'])
    z_coords = _grads_axis_values(metadata['zdef'])
    n_lon = metadata['xdef']['count']
    n_lat = metadata['ydef']['count']
    n_lev = metadata['zdef']['count']
    n_time = metadata['tdef']['count']

    # 2. Time parsing: Convert GrADS time format to pandas DatetimeIndex
    start_date = _grads_start_time(metadata['tdef']['start_str'])
    time_step = _grads_time_step(metadata['tdef']['increment_str'])
    time_coords = pd.date_range(start=start_date, periods=n_time, freq=time_step)

    # 3. Determine data type and byte order
    base_dtype = np.float32
    endian = '<' if metadata.get('byte_order', 'little_endian').lower() == 'little_endian' else '>'
    data_dtype = np.dtype(f"{endian}f{base_dtype().itemsize}")

    try:
        # 4. Calculate total expected number of values (elements) from CTL
        variables = metadata['variables']
        if len(variables) != metadata['num_vars']:
            raise ValueError(f"VARS declares {metadata['num_vars']} variables but "
                             f"{len(variables)} variable records were found.")

        levels_per_var = []
        for var_info in variables:
            levs = var_info.get('levs')
            if levs is None:
                levs = n_lev
            levels_per_var.append(max(levs, 1))  # levs == 0 marks a single-level field
        grids_per_step = sum(levels_per_var)

        total_expected_elements = n_time * grids_per_step * n_lat * n_lon
        expected_data_bytes = total_expected_elements * data_dtype.itemsize

        actual_file_size_bytes = os.path.getsize(data_file_path)

        # Calculate potential header/footer bytes
        excess_bytes = actual_file_size_bytes - expected_data_bytes

        if excess_bytes < 0:
            raise ValueError(f"Error: Data file '{data_file_path}' is smaller than expected. "
                             f"Expected {expected_data_bytes} bytes but found {actual_file_size_bytes} bytes. Conversion aborted.")
        elif excess_bytes > 0:
            print(f"Warning: Data file '{data_file_path}' contains {excess_bytes} excess bytes.")
            print("Assuming these excess bytes are at the *beginning* of the file (a header) and will be skipped.")
            bytes_to_skip_at_start = excess_bytes
        else:
            bytes_to_skip_at_start = 0
            print(f"File size matches expected data size: {actual_file_size_bytes} bytes.")

        # Read the binary data: Open the file and skip potential header, then read data
        with open(data_file_path, 'rb') as f:
            if bytes_to_skip_at_start > 0:
                f.seek(bytes_to_skip_at_start)
                print(f"Skipped {bytes_to_skip_at_start} bytes at the beginning of the file.")

            raw_data = np.fromfile(f, dtype=data_dtype, count=total_expected_elements)

        # Print a sample of raw data for diagnostic purposes
        print(f"Sample of raw data (first 20 elements): {raw_data[:20]}")

        # Verify that we actually read the expected number of elements
        if raw_data.size != total_expected_elements:
            raise ValueError(f"Failed to read expected number of elements after skipping header. "
                             f"Expected {total_expected_elements}, but read {raw_data.size}.")

        # Reshape the raw data.
        # On disk X varies fastest, then Y, then Z, then variable, then time,
        # so in C order each time step is a stack of (lat, lon) grids ordered
        # variable by variable, level by level.
        grids = raw_data.reshape((n_time, grids_per_step, n_lat, n_lon))

        # Replace the missing-value marker; compare at the data's own
        # precision so the match does not depend on float32/float64 promotion.
        undef = metadata.get('undef')
        if undef is not None:
            grids[grids == data_dtype.type(undef)] = np.nan

        # With the 'yrev' option the rows are stored in the opposite latitude
        # order from the one YDEF describes, so flip them to match y_coords.
        if 'yrev' in metadata.get('options', ()):
            grids = grids[:, :, ::-1, :]

        # 5. Create DataArrays for each variable and populate the Dataset
        data_vars = {}
        first_grid = 0
        for var_info, n_var_levels in zip(variables, levels_per_var):
            var_block = grids[:, first_grid:first_grid + n_var_levels, :, :]
            first_grid += n_var_levels

            dims = []
            var_coords = {}
            drop_axes = []
            if n_time > 1:
                dims.append('time')
                var_coords['time'] = time_coords
            else:
                drop_axes.append(0)
            if n_var_levels > 1:
                dims.append('z')
                var_coords['z'] = z_coords[:n_var_levels]
            else:
                drop_axes.append(1)
            dims.extend(['lat', 'lon'])
            var_coords['lat'] = y_coords
            var_coords['lon'] = x_coords

            # Only drop the length-1 time/z axes; a length-1 lat or lon axis
            # must survive because it is always listed in dims.
            var_values = var_block.squeeze(axis=tuple(drop_axes)) if drop_axes else var_block

            var_attrs = {'long_name': var_info['description']}
            if units is not None:
                var_attrs['units'] = units

            data_vars[var_info['name']] = xr.DataArray(
                var_values,
                coords=var_coords,
                dims=dims,
                name=var_info['name'],
                attrs=var_attrs
            )

        # 6. Create the xarray Dataset
        ds = xr.Dataset(
            data_vars=data_vars,
            coords={
                'time': time_coords,
                'z': z_coords,
                'lat': y_coords,
                'lon': x_coords
            },
            attrs={'title': metadata.get('title', 'Converted from GrADS binary')}
        )

        # Ensure the output directory exists
        output_dir = os.path.dirname(output_nc_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # 7. Save to NetCDF
        ds.to_netcdf(output_nc_file)
        print(f"Successfully converted '{ctl_file_path}' to '{output_nc_file}'")

    except FileNotFoundError:
        print(f"Error: Data file '{data_file_path}' not found. Please ensure the .dat file exists and the `dset` path in your .ctl is correct relative to the .ctl file.")
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        import traceback
        traceback.print_exc()

In [6]:
# --- How to use ---
# Set the GrADS control (.ctl) file to read and the NetCDF file to write.
# NOTE: If you encounter "Permission denied" errors, try changing the output
# directory to a simple, non-cloud-synced location, e.g., C:/temp/converted_data/

# NOTE: Make sure the output file name ends with the .nc extension.

ctl_file = "masking_test/to_convert/JunIC_nmme_precip_skill.ctl"
output_netcdf_file = "masking_test/converted/JunIC_nmme_precip_skill.nc"

In [7]:
# Call the conversion function. It reads the .ctl file and the data file named
# by its `dset` entry, then writes the NetCDF file. Conversion errors are
# printed rather than raised, so check the cell output for the success message.
convert_grads_to_netcdf(ctl_file, output_netcdf_file)


File size matches expected data size: 2085120 bytes.
Sample of raw data (first 20 elements): [-9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08
 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08
 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08 -9.99e+08]
Successfully converted 'masking_test/to_convert/JunIC_nmme_precip_skill.ctl' to 'masking_test/converted/JunIC_nmme_precip_skill.nc'
