Skip to content

Commit

Permalink
Added automatic checkpoints triggered by SIGUSR1 being sent to process
Browse files Browse the repository at this point in the history
  • Loading branch information
socratesgorilla authored and MengnanLi91 committed Feb 21, 2023
1 parent ee9e88c commit 26e65ee
Show file tree
Hide file tree
Showing 13 changed files with 417 additions and 46 deletions.
22 changes: 22 additions & 0 deletions framework/include/actions/AutoCheckpointAction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//* This file is part of the MOOSE framework
//* https://www.mooseframework.org
//*
//* All rights reserved, see COPYRIGHT for full restrictions
//* https://github.com/idaholab/moose/blob/master/COPYRIGHT
//*
//* Licensed under LGPL 2.1, please see LICENSE for details
//* https://www.gnu.org/licenses/lgpl-2.1.html

#pragma once

#include "Action.h"

/**
 * Action that adds a Checkpoint output named "autosave" to the problem when the application is
 * the ultimate master and no Checkpoint output exists yet.
 */
class AutoCheckpointAction : public Action
{
public:
/// Returns the input parameters accepted by this action
static InputParameters validParams();

/// Constructs the action from its input parameters
AutoCheckpointAction(const InputParameters & params);

/// Adds the "autosave" Checkpoint output if one is needed
virtual void act() override;
};
6 changes: 6 additions & 0 deletions framework/include/base/Moose.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,12 @@ static_assert(LIBMESH_DIM == 3,
*/
static constexpr std::size_t dim = LIBMESH_DIM;

/**
* Used by the signal handler to determine if we should write a checkpoint file out at any point
* during operation.
*/
extern int autosave_flag;

/**
* Set to true (the default) to print the stack trace with error and warning
* messages - false to omit it.
Expand Down
3 changes: 3 additions & 0 deletions framework/include/outputs/Checkpoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ class Checkpoint : public FileOutput
private:
void updateCheckpointFiles(CheckpointFileNames file_struct);

bool _should_output;

bool _is_autosave;
/// Max no. of output files to store
unsigned int _num_files;

Expand Down
31 changes: 31 additions & 0 deletions framework/src/actions/AutoCheckpointAction.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#include "AutoCheckpointAction.h"
#include "FEProblem.h"
#include "Checkpoint.h"
registerMooseAction("MooseApp", AutoCheckpointAction, "auto_checkpoint_action");

/**
 * Builds the input parameters accepted by AutoCheckpointAction.
 *
 * The action adds no parameters of its own on top of Action::validParams(); it only attaches a
 * class description.
 *
 * @return The parameters of the base Action plus the class description
 */
InputParameters
AutoCheckpointAction::validParams()
{
  InputParameters params = Action::validParams();

  params.addClassDescription(
      "Adds a Checkpoint output named 'autosave' that only writes after the process receives "
      "SIGUSR1, unless a Checkpoint output already exists.");

  return params;
}

/// Forwards the input parameters to the Action base class; no additional state is set up here.
AutoCheckpointAction::AutoCheckpointAction(const InputParameters & params)
  : Action(params)
{
}

/**
 * Adds a Checkpoint output named "autosave" with should_output = false and is_autosave = true.
 *
 * Nothing is added when the application is not the ultimate master, or when the output
 * warehouse already holds at least one Checkpoint object.
 */
void
AutoCheckpointAction::act()
{
  // Only the ultimate master application creates the autosave checkpoint
  if (!_app.isUltimateMaster())
    return;

  // If there's already a checkpoint object, there is no need to create a new one
  const bool checkpoint_exists = !_app.getOutputWarehouse().getOutputs<Checkpoint>().empty();
  if (checkpoint_exists)
    return;

  // Start from the default Checkpoint parameters, then mark the object as a dormant autosave
  auto autosave_params = _factory.getValidParams("Checkpoint");
  autosave_params.setParameters("should_output", false);
  autosave_params.setParameters("is_autosave", true);

  _problem->addOutput("Checkpoint", "autosave", autosave_params);
}
4 changes: 4 additions & 0 deletions framework/src/base/Moose.C
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,8 @@ addActionTypes(Syntax & syntax)
registerTask("create_problem_custom", false);
registerTask("create_problem_complete", false);

// Action for setting up the signal-based checkpoint
registerTask("auto_checkpoint_action", true);
/**************************/
/****** Dependencies ******/
/**************************/
Expand Down Expand Up @@ -332,6 +334,7 @@ addActionTypes(Syntax & syntax)
"(init_problem)"
"(add_control)"
"(check_output)"
"(auto_checkpoint_action)"
"(check_integrity)");
// clang-format on
}
Expand Down Expand Up @@ -576,6 +579,7 @@ bool _warnings_are_errors = false;
bool _deprecated_is_error = false;
bool _throw_on_error = false;
bool _throw_on_warning = false;
int autosave_flag = 0;
bool show_trace = true;
bool show_multiple = false;

Expand Down
3 changes: 2 additions & 1 deletion framework/src/base/MooseApp.C
Original file line number Diff line number Diff line change
Expand Up @@ -1595,6 +1595,8 @@ MooseApp::getCheckpointDirectories() const

// Add the directories added with Outputs/checkpoint=true input syntax
checkpoint_dirs.push_back(getOutputFileBase() + "_cp");
// Add the directories added with the autosave checkpoint input syntax
checkpoint_dirs.push_back(_output_file_base + "_autosave_cp");

// Add the directories from any existing checkpoint output objects
const auto & actions = _action_warehouse.getActionListByName("add_output");
Expand All @@ -1609,7 +1611,6 @@ MooseApp::getCheckpointDirectories() const
if (moose_object_action->getParam<std::string>("type") == "Checkpoint")
checkpoint_dirs.push_back(params.get<std::string>("file_base") + "_cp");
}

return checkpoint_dirs;
}

Expand Down
18 changes: 18 additions & 0 deletions framework/src/base/MooseInit.C
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,22 @@
#include <omp.h>
#endif

#include <unistd.h>
#include <signal.h>

/**
 * Signal handler that records the delivered signal number in Moose::autosave_flag.
 *
 * The handler does nothing else; the flag is inspected later by Checkpoint::output().
 *
 * NOTE(review): Moose::autosave_flag is declared as a plain int. Writing it from a signal
 * handler is normally done through volatile std::sig_atomic_t or a lock-free atomic -- confirm
 * whether the declaration should be changed.
 *
 * @param signum Number of the signal that was delivered
 */
void
my_sig_handler(int signum)
{
  Moose::autosave_flag = signum;
}

/**
 * Installs my_sig_handler as the handler for SIGUSR1.
 *
 * sigaction() is used rather than signal() because the semantics of signal() are
 * implementation-defined: on some systems the disposition is reset to the default after the
 * first delivery, in which case a second SIGUSR1 would terminate the process. With sigaction()
 * the handler stays installed, and SA_RESTART asks for interrupted system calls to be restarted
 * instead of failing with EINTR.
 */
void
register_sig_handler()
{
  struct sigaction action = {};
  action.sa_handler = my_sig_handler;
  // Do not block any additional signals while the handler runs
  sigemptyset(&action.sa_mask);
  action.sa_flags = SA_RESTART;

  // The previous disposition is not needed, so no old action is requested
  sigaction(SIGUSR1, &action, nullptr);
}

MooseInit::MooseInit(int argc, char * argv[], MPI_Comm COMM_WORLD_IN)
: LibMeshInit(argc, argv, COMM_WORLD_IN)
{
Expand All @@ -35,4 +51,6 @@ MooseInit::MooseInit(int argc, char * argv[], MPI_Comm COMM_WORLD_IN)

// Make sure that any calls to the global random number generator are consistent among processes
MooseRandom::seed(0);

register_sig_handler();
}
112 changes: 70 additions & 42 deletions framework/src/outputs/Checkpoint.C
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ Checkpoint::validParams()
{
// Get the parameters from the base classes
InputParameters params = FileOutput::validParams();

// Controls whether the checkpoint will actually run. Should only ever be changed by the
// auto-checkpoint created by the signal handler, which does not write unless a signal is
// received.
params.addPrivateParam<bool>("should_output", true);
params.addPrivateParam<bool>("is_autosave", false);

params.addClassDescription("Output for MOOSE recovery checkpoint files.");

// Typical checkpoint options
Expand All @@ -47,6 +54,8 @@ Checkpoint::validParams()

Checkpoint::Checkpoint(const InputParameters & parameters)
: FileOutput(parameters),
_should_output(getParam<bool>("should_output")),
_is_autosave(getParam<bool>("is_autosave")),
_num_files(getParam<unsigned int>("num_files")),
_suffix(getParam<std::string>("suffix")),
_binary(getParam<bool>("binary")),
Expand All @@ -63,6 +72,7 @@ Checkpoint::filename()
std::ostringstream output;
output << directory() << "/" << std::setw(_padding) << std::setprecision(0) << std::setfill('0')
<< std::right << timeStep();

return output.str();
}

Expand All @@ -75,60 +85,78 @@ Checkpoint::directory() const
void
Checkpoint::output(const ExecFlagType & /*type*/)
{
// Create the output directory
std::string cp_dir = directory();
Utility::mkdir(cp_dir.c_str());
// Check if we should write the autosave checkpoint. The only time _should_output will
// be false is if this Checkpoint object is created through AutoCheckpointAction, and if it
// is false, then we check if the signal handler has set the flag for us to write it out.
if (!_should_output)
{
comm().max(Moose::autosave_flag);
_should_output = Moose::autosave_flag;
}

// Create the output filename
std::string current_file = filename();
if (_should_output)
{
// Create the output directory
std::string cp_dir = directory();
Utility::mkdir(cp_dir.c_str());

// Create the libMesh Checkpoint_IO object
MeshBase & mesh = _es_ptr->get_mesh();
CheckpointIO io(mesh, _binary);
// Create the output filename
std::string current_file = filename();

// Set libHilbert renumbering flag to false. We don't support
// N-to-M restarts regardless, and if we're *never* going to do
// N-to-M restarts then libHilbert is just unnecessary computation
// and communication.
const bool renumber = false;
// Create the libMesh Checkpoint_IO object
MeshBase & mesh = _es_ptr->get_mesh();
CheckpointIO io(mesh, _binary);

// Create checkpoint file structure
CheckpointFileNames curr_file_struct;
// Set libHilbert renumbering flag to false. We don't support
// N-to-M restarts regardless, and if we're *never* going to do
// N-to-M restarts then libHilbert is just unnecessary computation
// and communication.
const bool renumber = false;

curr_file_struct.checkpoint = current_file + getMeshFileSuffix(_binary);
curr_file_struct.system = current_file + _restartable_data_io.getESFileExtension(_binary);
curr_file_struct.restart = current_file + _restartable_data_io.getRestartableDataExt();
// Create checkpoint file structure
CheckpointFileNames curr_file_struct;

// Write the checkpoint file
io.write(curr_file_struct.checkpoint);
curr_file_struct.checkpoint = current_file + getMeshFileSuffix(_binary);
curr_file_struct.system = current_file + _restartable_data_io.getESFileExtension(_binary);
curr_file_struct.restart = current_file + _restartable_data_io.getRestartableDataExt();

// Write out the restartable mesh meta data if there is any (only on processor zero)
if (processor_id() == 0)
{
for (auto & map_pair :
libMesh::as_range(_app.getRestartableDataMapBegin(), _app.getRestartableDataMapEnd()))
{
const RestartableDataMap & meta_data = map_pair.second.first;
const std::string & suffix = map_pair.second.second;
const std::string filename(curr_file_struct.checkpoint + "/meta_data" + suffix +
_restartable_data_io.getRestartableDataExt());
// Write the checkpoint file
io.write(curr_file_struct.checkpoint);

curr_file_struct.restart_meta_data.emplace(filename);
_restartable_data_io.writeRestartableData(filename, meta_data);
// Write out the restartable mesh meta data if there is any (only on processor zero)
if (processor_id() == 0)
{
for (auto & map_pair :
libMesh::as_range(_app.getRestartableDataMapBegin(), _app.getRestartableDataMapEnd()))
{
const RestartableDataMap & meta_data = map_pair.second.first;
const std::string & suffix = map_pair.second.second;
const std::string filename(curr_file_struct.checkpoint + "/meta_data" + suffix +
_restartable_data_io.getRestartableDataExt());
curr_file_struct.restart_meta_data.emplace(filename);
_restartable_data_io.writeRestartableData(filename, meta_data);
}
}
}

// Write the system data, using ENCODE vs WRITE based on ascii vs binary format
_es_ptr->write(curr_file_struct.system,
EquationSystems::WRITE_DATA | EquationSystems::WRITE_ADDITIONAL_DATA |
EquationSystems::WRITE_PARALLEL_FILES,
renumber);
// Write the system data, using ENCODE vs WRITE based on ascii vs binary format
_es_ptr->write(curr_file_struct.system,
EquationSystems::WRITE_DATA | EquationSystems::WRITE_ADDITIONAL_DATA |
EquationSystems::WRITE_PARALLEL_FILES,
renumber);

// Write the restartable data
_restartable_data_io.writeRestartableDataPerProc(curr_file_struct.restart, _restartable_data);

// Write the restartable data
_restartable_data_io.writeRestartableDataPerProc(curr_file_struct.restart, _restartable_data);
// Remove old checkpoint files
updateCheckpointFiles(curr_file_struct);

// Remove old checkpoint files
updateCheckpointFiles(curr_file_struct);
// Stop outputting the checkpoint if this is a signaled/autosave checkpoint
if (Moose::autosave_flag and _is_autosave)
{
_should_output = false;
Moose::autosave_flag = 0;
}
}
}

void
Expand Down
10 changes: 7 additions & 3 deletions modules/doc/content/application_usage/restart_recover.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
- +Restart+: Running a simulation that uses data from a previous simulation. Data in this context is very broad, it can mean spatial field data, non-spatial variables or postprocessors, or stateful object data. Usually the previous and new simulations use different input files.
- +Recover+: Resuming an existing simulation either due to a fault or other premature termination.
- +Solution File+: A mesh format containing field data in addition to the mesh (i.e. a normal output file).
- +Checkpoint+: A snapshot of the simulation data including all meshes, solutions, and stateful object data. Typically one checkpoint is stored in several different files.
- +Checkpoint+: A snapshot of the simulation data including all meshes, solutions, and stateful object data. Typically one checkpoint is stored in several different files. MOOSE creates a checkpoint object for every simulation, but by default that object does not write any files.
- +N to N+: In a restart context, this means the number of processors for the previous and current simulations must match.
- +N to M+: In a restart context, different numbers of processors may be used for the previous and current simulations.

Expand Down Expand Up @@ -45,7 +45,7 @@
## Enabling Checkpoints

- Advanced restart in MOOSE requires checkpoint files.
- To enable automatic checkpoints using the default options (every time step, and keep last two) in your simulation simply add the following flag to your input file:
- To write checkpoints continuously using the default options (every time step, keeping the last two) in your simulation, simply add the following flag to your input file:

```puppet
[Outputs]
Expand All @@ -67,6 +67,10 @@ For a complete list see the Doxygen page for Checkpoint. * You should always set
[]
```

MOOSE also automatically creates a checkpoint object in the background that can write out a checkpoint file on demand in case of emergency, e.g. a long simulation that must be aborted due to external circumstances. To do this, find the process ID (PID) of your currently running MOOSE instance by running `ps` in another terminal window. Once you have located the PID, run `kill -s USR1 <yourPIDhere>` in the same window in which you ran `ps`. On the next time step, MOOSE will write its current progress to a checkpoint file that can be used later to restart the simulation from the same position.

Note that although this command is called `kill`, it does not terminate the MOOSE process when used with this syntax; it merely triggers the MOOSE instance to write out a checkpoint.

## Advanced Restart

- This method is best suited for situations when the mesh from the previous simulation and the current simulation match but all variables should be reloaded and all stateful data should be restored.
Expand Down Expand Up @@ -106,4 +110,4 @@ More information about MultiApp restart/recover can be found at [MultiApps](synt

`start_time` can be continued on from the previous simulation, or can be overridden on restart. If this parameter is omitted from your input file
the default will be to continue from the previous simulation. If you supply the parameter in your input file, the new simulation will begin from
the supplied time.
the supplied time.

0 comments on commit 26e65ee

Please sign in to comment.