Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions config.default
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ mpasMeshName = mesh
# first be divided among the tasks before applying this fraction.
autocloseFileLimitFraction = 0.5

# Processing large datasets can trigger a memory error. Specifying a maximum
# chunk size `maxChunkSize` can help prevent this. The current maximum chunk
# size assumes approximately 64 GB of RAM and large files with a single time
# slice.
maxChunkSize = 10000

[output]
## options related to writing out plots, intermediate cached data sets, logs,
## etc.
Expand Down
3 changes: 2 additions & 1 deletion mpas_analysis/ocean/meridional_overturning_circulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,8 @@ def _compute_moc_climo_postprocess(config, runStreams, variableMap, calendar,
# Compute annual climatology
annualClimatology = ds.mean('Time')

# Convert to numpy arrays
# Convert to numpy arrays
# (can result in a memory error for large array sizes)
horizontalVel = annualClimatology.avgNormalVelocity.values
verticalVel = annualClimatology.avgVertVelocityTop.values
velArea = verticalVel * areaCell[:, np.newaxis]
Expand Down
24 changes: 17 additions & 7 deletions mpas_analysis/shared/generalized_reader/generalized_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,16 @@ def open_multifile_dataset(fileNames, calendar, config,

Author
------
Xylar Asay-Davis
Xylar Asay-Davis, Phillip J. Wolfram

Last modified
-------------
02/23/2017
03/29/2017
"""

# limit chunk size to prevent memory error
maxChunkSize = config.getint('input', 'maxChunkSize')

preprocess_partial = partial(_preprocess,
calendar=calendar,
simulationStartTime=simulationStartTime,
Expand All @@ -124,7 +128,8 @@ def open_multifile_dataset(fileNames, calendar, config,
iselValues=iselValues,
variableMap=variableMap,
startDate=startDate,
endDate=endDate)
endDate=endDate,
maxChunkSize=maxChunkSize)

kwargs = {'decode_times': False,
'concat_dim': 'Time'}
Expand Down Expand Up @@ -179,7 +184,7 @@ def open_multifile_dataset(fileNames, calendar, config,

def _preprocess(ds, calendar, simulationStartTime, timeVariableName,
variableList, selValues, iselValues, variableMap,
startDate, endDate): # {{{
startDate, endDate, maxChunkSize): # {{{
"""
Performs variable remapping, then calls mpas_xarray.preprocess, to
perform the remainder of preprocessing.
Expand Down Expand Up @@ -242,6 +247,10 @@ def _preprocess(ds, calendar, simulationStartTime, timeVariableName,
If present, the first and last dates to be used in the data set. The
time variable is sliced to only include dates within this range.

maxChunkSize : int
Specifies the maximum chunk size used to limit the chunks created by
dask, in order to prevent out-of-memory errors for large datasets.


Returns
-------
ds : xarray.DataSet object
Expand All @@ -250,11 +259,11 @@ def _preprocess(ds, calendar, simulationStartTime, timeVariableName,

Authors
-------
Xylar Asay-Davis
Xylar Asay-Davis, Phillip J. Wolfram

Last modified
-------------
02/17/2017
03/30/2017
"""

submap = variableMap
Expand Down Expand Up @@ -283,7 +292,8 @@ def _preprocess(ds, calendar, simulationStartTime, timeVariableName,
timeVariableName=timeVariableName,
variableList=variableList,
selValues=selValues,
iselValues=iselValues)
iselValues=iselValues,
maxChunkSize=maxChunkSize)

return ds # }}}

Expand Down
17 changes: 14 additions & 3 deletions mpas_analysis/shared/mpas_xarray/mpas_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def subset_variables(ds, variableList): # {{{


def preprocess(ds, calendar, simulationStartTime, timeVariableName,
variableList, selValues, iselValues): # {{{
variableList, selValues, iselValues, maxChunkSize=1000): # {{{
"""
Builds correct time specification for MPAS, allowing a date offset
because the time must be between 1678 and 2262 based on the xarray
Expand Down Expand Up @@ -229,6 +229,10 @@ def preprocess(ds, calendar, simulationStartTime, timeVariableName,
iselValues = {'nVertLevels': slice(0, 3),
'nCells': cellIDs}

maxChunkSize : int
Specifies the maximum chunk size used to limit the chunks created by
dask, in order to prevent out-of-memory errors for large datasets.


Returns
-------
ds : xarray.DataSet object
Expand All @@ -242,7 +246,7 @@ def preprocess(ds, calendar, simulationStartTime, timeVariableName,

Last modified
-------------
02/17/2017
03/30/2017
"""

ds = _parse_dataset_time(ds=ds,
Expand All @@ -265,6 +269,13 @@ def preprocess(ds, calendar, simulationStartTime, timeVariableName,
if iselValues is not None:
ds = ds.isel(**iselValues)

chunks = {}
for name in ds.chunks.keys():
chunklim = np.asarray(ds.chunks[name]).max()
chunks[name] = np.minimum(maxChunkSize, chunklim)

ds = ds.chunk(chunks)

return ds # }}}


Expand Down Expand Up @@ -502,7 +513,7 @@ def _parse_dataset_time(ds, inTimeVariableName, calendar,
'simulationStartTime was not '
'supplied.'.format(inTimeVariableName))

if (string_to_datetime(referenceDate) ==
if (string_to_datetime(referenceDate) ==
string_to_datetime(simulationStartTime)):
days = timeVar.values
else:
Expand Down
4 changes: 3 additions & 1 deletion mpas_analysis/test/test_generalized_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
@pytest.mark.usefixtures("loaddatadir")
class TestGeneralizedReader(TestCase):

def setup_config(self, autocloseFileLimitFraction=0.5):
def setup_config(self, autocloseFileLimitFraction=0.5,
maxChunkSize=10000):
config = MpasAnalysisConfigParser()
config.add_section('input')
config.set('input', 'autocloseFileLimitFraction',
str(autocloseFileLimitFraction))
config.set('input', 'maxChunkSize', str(maxChunkSize))
return config

def test_variableMap(self):
Expand Down