Skip to content

Commit

Permalink
Fix Startup Crash Caused by Read() and Write() on Un-activated Etherc…
Browse files Browse the repository at this point in the history
…at (#120)

* Use at() instead of [], propagate activation failures, lock ethercat

* Convert NULL domain debug to string, add additional NULL checks
  • Loading branch information
tony-laza committed May 29, 2024
1 parent 3aaca92 commit 52be2c2
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 14 deletions.
2 changes: 2 additions & 0 deletions ethercat_driver/include/ethercat_driver/ethercat_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class EthercatDriver : public hardware_interface::SystemInterface

int control_frequency_;
ethercat_interface::EcMaster master_;
std::mutex ec_mutex_;
bool activated_;
};
} // namespace ethercat_driver

Expand Down
30 changes: 27 additions & 3 deletions ethercat_driver/src/ethercat_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ CallbackReturn EthercatDriver::on_init(
return CallbackReturn::ERROR;
}

const std::lock_guard<std::mutex> lock(ec_mutex_);
activated_ = false;

hw_joint_states_.resize(info_.joints.size());
for (uint j = 0; j < info_.joints.size(); j++) {
hw_joint_states_[j].resize(
Expand Down Expand Up @@ -259,6 +262,11 @@ EthercatDriver::export_command_interfaces()
CallbackReturn EthercatDriver::on_activate(
const rclcpp_lifecycle::State & /*previous_state*/)
{
const std::lock_guard<std::mutex> lock(ec_mutex_);
if (activated_) {
RCLCPP_FATAL(rclcpp::get_logger("EthercatDriver"), "Double on_activate()");
return CallbackReturn::ERROR;
}
RCLCPP_INFO(rclcpp::get_logger("EthercatDriver"), "Starting ...please wait...");
if (info_.hardware_parameters.find("control_frequency") == info_.hardware_parameters.end()) {
control_frequency_ = 100;
Expand Down Expand Up @@ -303,7 +311,10 @@ CallbackReturn EthercatDriver::on_activate(
}
}

master_.activate();
if (!master_.activate()) {
RCLCPP_ERROR(rclcpp::get_logger("EthercatDriver"), "Activate EcMaster failed");
return CallbackReturn::ERROR;
}
RCLCPP_INFO(rclcpp::get_logger("EthercatDriver"), "Activated EcMaster!");

// start after one second
Expand Down Expand Up @@ -339,12 +350,17 @@ CallbackReturn EthercatDriver::on_activate(
RCLCPP_INFO(
rclcpp::get_logger("EthercatDriver"), "System Successfully started!");

activated_ = true;

return CallbackReturn::SUCCESS;
}

CallbackReturn EthercatDriver::on_deactivate(
const rclcpp_lifecycle::State & /*previous_state*/)
{
const std::lock_guard<std::mutex> lock(ec_mutex_);
activated_ = false;

RCLCPP_INFO(rclcpp::get_logger("EthercatDriver"), "Stopping ...please wait...");

// stop EC and disconnect
Expand All @@ -360,15 +376,23 @@ hardware_interface::return_type EthercatDriver::read(
const rclcpp::Time & /*time*/,
const rclcpp::Duration & /*period*/)
{
master_.readData();
// try to lock so we can avoid blocking the read/write loop on the lock.
const std::unique_lock<std::mutex> lock(ec_mutex_, std::try_to_lock);
if (lock.owns_lock() && activated_) {
master_.readData();
}
return hardware_interface::return_type::OK;
}

hardware_interface::return_type EthercatDriver::write(
const rclcpp::Time & /*time*/,
const rclcpp::Duration & /*period*/)
{
master_.writeData();
// try to lock so we can avoid blocking the read/write loop on the lock.
const std::unique_lock<std::mutex> lock(ec_mutex_, std::try_to_lock);
if (lock.owns_lock() && activated_) {
master_.writeData();
}
return hardware_interface::return_type::OK;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class EcMaster
int configSlaveSdo(uint16_t slave_position, SdoConfigEntry sdo_config, uint32_t * abort_code);

/** call after adding all slaves, and before update */
void activate();
bool activate();

/** perform one EtherCAT cycle, passing the domain to the slaves */
virtual void update(uint32_t domain = 0);
Expand Down
45 changes: 35 additions & 10 deletions ethercat_interface/src/ec_master.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ EcMaster::~EcMaster()
//
}
for (auto & domain : domain_info_) {
delete domain.second;
if (domain.second != NULL) {
delete domain.second;
}
}
}

Expand Down Expand Up @@ -128,7 +130,10 @@ void EcMaster::addSlave(uint16_t alias, uint16_t position, EcSlave * slave)
for (auto & iter : domain_map) {
// get the domain info, create if necessary
uint32_t domain_index = iter.first;
DomainInfo * domain_info = domain_info_[domain_index];
DomainInfo * domain_info = NULL;
if (domain_info_.count(domain_index)) {
domain_info = domain_info_.at(domain_index);
}
if (domain_info == NULL) {
domain_info = new DomainInfo(master_);
domain_info_[domain_index] = domain_info;
Expand Down Expand Up @@ -210,17 +215,20 @@ void EcMaster::registerPDOInDomain(
domain_info->domain_regs.back() = empty;
}

void EcMaster::activate()
bool EcMaster::activate()
{
// register domain
for (auto & iter : domain_info_) {
DomainInfo * domain_info = iter.second;
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(iter.first));
}
bool domain_status = ecrt_domain_reg_pdo_entry_list(
domain_info->domain,
&(domain_info->domain_regs[0]));
if (domain_status) {
printWarning("Activate. Failed to register domain PDO entries.");
return;
return false;
}
}
// set application time
Expand All @@ -232,26 +240,33 @@ void EcMaster::activate()
bool activate_status = ecrt_master_activate(master_);
if (activate_status) {
printWarning("Activate. Failed to activate master.");
return;
return false;
}

// retrieve domain data
for (auto & iter : domain_info_) {
DomainInfo * domain_info = iter.second;
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(iter.first));
}
domain_info->domain_pd = ecrt_domain_data(domain_info->domain);
if (domain_info->domain_pd == NULL) {
printWarning("Activate. Failed to retrieve domain process data.");
return;
return false;
}
}
return true;
}

void EcMaster::update(uint32_t domain)
{
// receive process data
ecrt_master_receive(master_);

DomainInfo * domain_info = domain_info_[domain];
DomainInfo * domain_info = domain_info_.at(domain);
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(domain));
}

ecrt_domain_process(domain_info->domain);

Expand Down Expand Up @@ -290,7 +305,10 @@ void EcMaster::readData(uint32_t domain)
// receive process data
ecrt_master_receive(master_);

DomainInfo * domain_info = domain_info_[domain];
DomainInfo * domain_info = domain_info_.at(domain);
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(domain));
}

ecrt_domain_process(domain_info->domain);

Expand All @@ -315,7 +333,11 @@ void EcMaster::readData(uint32_t domain)

void EcMaster::writeData(uint32_t domain)
{
DomainInfo * domain_info = domain_info_[domain];
DomainInfo * domain_info = domain_info_.at(domain);
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(domain));
}

// read and write process data
for (DomainInfo::Entry & entry : domain_info->entries) {
for (int i = 0; i < entry.num_pdos; ++i) {
Expand Down Expand Up @@ -426,7 +448,10 @@ void EcMaster::setThreadRealTime()

void EcMaster::checkDomainState(uint32_t domain)
{
DomainInfo * domain_info = domain_info_[domain];
DomainInfo * domain_info = domain_info_.at(domain);
if (domain_info == NULL) {
throw std::runtime_error("Null domain info: " + std::to_string(domain));
}

ec_domain_state_t ds;
ecrt_domain_state(domain_info->domain, &ds);
Expand Down

2 comments on commit 52be2c2

@wei1224hf
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this fixbug , is about , when it start up , the motor would run to position 0 quickly ,
right?

@yguel
Copy link
Contributor

@yguel yguel commented on 52be2c2 Jun 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi wei1224hf,
It should not change the behaviour of the motor at start-up.
What it does, is that the actual behaviour of the library is to initiate communication with the hardware in the on_activate step (https://control.ros.org/master/_images/hardware_interface_lifecycle.png) which does not respect the hardware interface lifecycle (should be in the on_configure state).
The library is actualing relying on going very quickly to the on_activate step as soon as the configure() call is made which hides the problem for most of the users (Alas! this does not work all the time, it is still cheating after all).
This good fix by tony-laza prevents some crash when communication is attempted before the true call to activate() is performed.
We have a plan to truly follow the hardware interface lifecycle, but in the meantime, this fix will do a good job.

I hope it answers your question.
If you experience new unexpected behaviour with this commit, please report it to us. We will be very eager to fix the issue.

  • all the best,

Please sign in to comment.