In [2]:
import netCDF4 as nc
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the NetCDF file
# The same file is opened twice: `dataset` is a netCDF4 handle and `ds` is an
# xarray Dataset, so both APIs can be used on it in later cells.
# NOTE(review): the path is a hardcoded absolute location; it must be edited
# before this notebook can run anywhere else.
file_path = '/glade/work/wchuang/mlmicrophysics/cesm_output/tauREV4/cam_ml_tauREV4_train.cam.h1.2005-01-01-00000.nc'  # Replace with your NetCDF file path
dataset = nc.Dataset(file_path)  # netCDF4 handle
ds = xr.open_dataset(file_path)  # xarray handle

In [4]:
# Inspect the first entry of the dataset's "time" variable.
ds["time"][0]

In [5]:
# Bare expression: the notebook shows the xarray dataset's repr as the cell output.
ds

In [11]:
# 1. Print basic information about the NetCDF file
def print_basic_info(dataset):
    """Print an overview of a dataset: format, global attributes, dimensions, variables.

    Args:
        dataset: Object exposing ``file_format``, ``ncattrs()``,
            ``getncattr(name)`` and the mappings ``dimensions`` and
            ``variables``.

    Returns:
        None. Everything is written to stdout.
    """
    print("File format:", dataset.file_format)

    print("\nGlobal Attributes:")
    for name in dataset.ncattrs():
        value = dataset.getncattr(name)
        print(f"{name}: {value}")

    # Dimensions and variables are reported the same way: a heading followed by
    # one line per entry, using each entry's own string form.
    sections = (
        ("\nDimensions:", "dimensions"),
        ("\nVariables:", "variables"),
    )
    for heading, mapping_name in sections:
        print(heading)
        for entry in getattr(dataset, mapping_name).values():
            print(entry)
        
# Run the overview on the netCDF4 handle (`dataset`), not the xarray one.
print_basic_info(dataset)

File format: NETCDF3_64BIT_OFFSET

Global Attributes:
Conventions: CF-1.0
source: CAM
case: cam_ml_tauREV4_train
logname: andrew
host: derecho4
initial_file: /glade/campaign/cesm/cesmdata/inputdata/atm/cam/inic/fv/cami-mam3_0000-01-01_0.9x1.25_L32_c141031.nc
topography_file: /glade/campaign/cesm/cesmdata/inputdata/atm/cam/topo/fv_0.9x1.25_nc3000_Nsw042_Nrs008_Co060_Fi001_ZR_sgh30_24km_GRNL_c170103.nc
model_doi_url: not_set
time_period_freq: hour_23

Dimensions:
<class 'netCDF4._netCDF4.Dimension'>: name = 'lat', size = 192
<class 'netCDF4._netCDF4.Dimension'>: name = 'lon', size = 288
<class 'netCDF4._netCDF4.Dimension'> (unlimited): name = 'time', size = 32
<class 'netCDF4._netCDF4.Dimension'>: name = 'nbnd', size = 2
<class 'netCDF4._netCDF4.Dimension'>: name = 'chars', size = 8
<class 'netCDF4._netCDF4.Dimension'>: name = 'lev', size = 32
<class 'netCDF4._netCDF4.Dimension'>: name = 'ilev', size = 33
<class 'netCDF4._netCDF4.Dimension'>: name = 'trop_cld_lev', size = 32
<class 'netC

In [6]:
# 2. List all variables and their dimensions
def list_variables(dataset):
    """Print name, dimensions, shape and attributes of every variable.

    Args:
        dataset: Object whose ``variables`` mapping yields items exposing
            ``dimensions``, ``shape``, ``ncattrs()`` and ``getncattr(name)``.

    Returns:
        None. Everything is written to stdout.
    """
    for name, var in dataset.variables.items():
        # Blank line before each variable keeps the listing readable.
        print(f"\nVariable: {name}")
        print("Dimensions:", var.dimensions)
        print("Shape:", var.shape)
        print("Attributes:")
        for key in var.ncattrs():
            attr_value = var.getncattr(key)
            print(f"  {key}: {attr_value}")
            
# Run the per-variable listing on the netCDF4 handle (`dataset`).
list_variables(dataset)


Variable: lat
Dimensions: ('lat',)
Shape: (192,)
Attributes:
  long_name: latitude
  units: degrees_north

Variable: lon
Dimensions: ('lon',)
Shape: (288,)
Attributes:
  long_name: longitude
  units: degrees_east

Variable: gw
Dimensions: ('lat',)
Shape: (192,)
Attributes:
  long_name: latitude weights

Variable: lev
Dimensions: ('lev',)
Shape: (32,)
Attributes:
  long_name: hybrid level at midpoints (1000*(A+B))
  units: hPa
  positive: down
  standard_name: atmosphere_hybrid_sigma_pressure_coordinate
  formula_terms: a: hyam b: hybm p0: P0 ps: PS

Variable: hyam
Dimensions: ('lev',)
Shape: (32,)
Attributes:
  long_name: hybrid A coefficient at layer midpoints

Variable: hybm
Dimensions: ('lev',)
Shape: (32,)
Attributes:
  long_name: hybrid B coefficient at layer midpoints

Variable: P0
Dimensions: ()
Shape: ()
Attributes:
  long_name: reference pressure
  units: Pa

Variable: ilev
Dimensions: ('ilev',)
Shape: (33,)
Attributes:
  long_name: hybrid level at interfaces (1000*(A+B))
  u

In [7]:
# Build `ds2`: the xarray dataset without the six variables listed below.
# NOTE(review): presumably these are scalar (0-d) variables removed so that the
# per-variable summary via to_series() works; P0 is reported with empty
# dimensions in the variable listing, confirm the same holds for the others.
ds2 = ds.drop_vars(['P0', 'ndbase', 'nsbase', 'nbdate', 'nbsec', 'mdt'])

In [8]:
# Boolean mask: True where CLOUD exceeds 1e-2 AND QC_TAU_in exceeds 1e-6.
# NOTE(review): both thresholds are unexplained magic numbers; their units and
# rationale are not stated in this notebook, so consider named constants.
filter_mask = (ds2.data_vars['CLOUD'] > 1e-2) & (ds2.data_vars['QC_TAU_in'] > 1e-6)

In [None]:
# Count the grid points where the mask is True. `.sum()` reduces over every
# dimension in one vectorized call; the previous four nested builtin sum()
# calls iterated the array in Python and hard-coded a rank of 4.
filter_mask.sum()

In [40]:
# NOTE(review): `cloud_array` is not defined in any cell visible in this
# notebook, and this cell's execution count (40) is out of sequence, so it
# presumably depends on leftover kernel state. Define `cloud_array` explicitly
# or remove this cell, otherwise a fresh "Restart & Run All" stops here.
sum(sum(sum(sum(cloud_array))))

In [26]:
# 3. Summary statistics for each variable
def summarize_variables(ds):
    """Print descriptive statistics for every data variable of a dataset.

    Each variable is converted with ``to_series()`` and the result of
    ``describe()`` on that series is printed.

    Args:
        ds: Object exposing an iterable ``data_vars`` of names and supporting
            ``ds[name]``; each item must provide ``to_series()``.

    Returns:
        None. Everything is written to stdout.

    NOTE(review): ``to_series()`` presumably cannot handle 0-d variables, so
    callers appear to drop those beforehand; confirm.
    """
    print("\nSummary Statistics:")
    for name in ds.data_vars:
        print(f"\nVariable: {name}")
        series = ds[name].to_series()
        print(series.describe())

# Summarize `ds2` (the dataset with the dropped variables), not the full `ds`.
summarize_variables(ds2)


Summary Statistics:

Variable: gw
count    192.000000
mean       0.010417
std        0.005118
min        0.000034
25%        0.006263
50%        0.011582
75%        0.015156
max        0.016447
Name: gw, dtype: float64

Variable: hyam
count    32.000000
mean      0.071518
std       0.058274
min       0.000000
25%       0.020759
50%       0.056599
75%       0.121676
max       0.178231
Name: hyam, dtype: float64

Variable: hybm
count    32.000000
mean      0.326108
std       0.389771
min       0.000000
25%       0.000000
50%       0.087696
75%       0.730823
max       0.992556
Name: hybm, dtype: float64

Variable: hyai
count    33.000000
mean      0.069328
std       0.059041
min       0.000000
25%       0.018555
50%       0.056240
75%       0.113876
max       0.181863
Name: hyai, dtype: float64

Variable: hybi
count    33.000000
mean      0.331378
std       0.395544
min       0.000000
25%       0.000000
50%       0.085654
75%       0.753628
max       1.000000
Name: hybi, dtype: float64


In [None]:
# 4. Basic plot of a variable (replace 'variable_name' with an actual variable name from your dataset)
def plot_variable(ds, var_name):
    """Plot a 2-D variable, or the first time slice of a 3-D variable.

    Args:
        ds: Dataset-like object supporting ``var_name in ds.variables`` and
            ``ds[var_name]``; the selected item must expose ``ndim``, ``dims``,
            ``plot()`` and, for 3-D data, ``isel(time=...)``.
        var_name: Name of the variable to plot.

    Returns:
        None. A figure is shown only when something was actually drawn. For a
        missing variable or an unsupported layout a message is printed and
        nothing is plotted.
    """
    if var_name not in ds.variables:
        print(f"{var_name} not found in dataset.")
        return

    data = ds[var_name]
    if data.ndim == 2:  # plain 2-D field
        data.plot()
        plt.title(f"{var_name} (2D Plot)")
    elif data.ndim == 3:
        # Only a 'time' axis can be sliced away here; without this guard,
        # isel(time=0) raises for 3-D variables that have no such dimension.
        if "time" not in data.dims:
            print(
                f"Cannot plot variable '{var_name}': 3D data without a "
                f"'time' dimension (dims: {data.dims})"
            )
            return
        time_slice = data.isel(time=0)  # first time step
        time_slice.plot()
        plt.title(f"{var_name} at time=0")
    else:
        print(f"Cannot plot variable '{var_name}' with dimensions {data.ndim}")
        return  # nothing was drawn, so there is no figure to show

    plt.show()

# NOTE(review): plot_variable only draws 2-D and 3-D variables; if CLDLIQ has
# more dimensions (e.g. an additional level axis) this call only prints a
# "Cannot plot" message. Confirm against the variable listing.
plot_variable(ds, 'CLDLIQ')  # 'CLDLIQ' is the variable to plot; swap in any other variable name

In [None]:
# 5. Close the dataset when done
# Both handles opened on the file are closed here.
dataset.close()  # netCDF4 handle
ds.close()  # xarray handle