Skip to content

Commit

Permalink
wait for resources to be running stably
Browse files Browse the repository at this point in the history
Currently, we only check that the created resource becomes InUse on any
node. However, if the resource agent encounters an error on startup, it
will briefly become InUse on one node, but it will move away from that
node once startup fails.

This is especially confusing to users because linstor-gateway will show
the resource as "running" (which actually means something closer to
"deployed"), but in reality it is hopping between nodes, unable to
start.

So, when deploying a resource, wait a few seconds to see if the LINSTOR
resource stays on one node consistently. Also, if it does fail to
deploy, make sure to remove the reactor config file from LINSTOR's
database. That way, the creation process should be fully rolled back if
something goes wrong.

There are two scenarios that this does not cover:
* Deployment only fails on one node, then switches to another and
  succeeds. This would be seen as a failure to deploy.
* The resource agent takes longer than 5 seconds to fail.

Both are firmly in the "too bad" category, but we may still want to
extend the behavior in the future to cover these cases.
  • Loading branch information
chrboe committed Apr 4, 2024
1 parent 750c2a2 commit 0a59e3e
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 0 deletions.
44 changes: 44 additions & 0 deletions pkg/common/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,47 @@ func WaitUntilResourceCondition(ctx context.Context, cli *client.Client, name st
time.Sleep(3 * time.Second)
}
}

func getInUseNode(ctx context.Context, cli *client.Client, name string) (string, error) {
resources, err := cli.Resources.GetResourceView(ctx, &client.ListOpts{Resource: []string{name}})
if err != nil {
return "", err
}
for _, resource := range resources {
if resource.State != nil && resource.State.InUse != nil && *resource.State.InUse {
return resource.NodeName, nil
}
}

return "", nil
}

// AssertResourceInUseStable records the node that a resource is running on, and then monitors the resource to check
// that it remains on the same node for a few seconds. This is useful for sanity-checking that a resource has started
// up and is healthy.
func AssertResourceInUseStable(ctx context.Context, cli *client.Client, name string) error {
initialNode, err := getInUseNode(ctx, cli, name)
if err != nil {
return fmt.Errorf("failed to get InUse node for resource %s: %w", name, err)
}
if initialNode == "" {
return fmt.Errorf("resource %s is not in use on any node", name)
}

count := 0
for {
node, err := getInUseNode(ctx, cli, name)
if err != nil {
return err
}
if node != initialNode {
return fmt.Errorf("resource startup failed on node %s", initialNode)
}

time.Sleep(1 * time.Second)
count++
if count > 5 {
return nil
}
}
}
19 changes: 19 additions & 0 deletions pkg/iscsi/iscsi.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,20 @@ func (i *ISCSI) Create(ctx context.Context, rsc *ResourceConfig) (*ResourceConfi
return nil, fmt.Errorf("failed to register reactor config file: %w", err)
}

defer func() {
// if we fail beyond this point, delete the just created reactor config
if err != nil {
log.Debugf("Rollback: deleting just created reactor config %s", rsc.ID())
if err := reactor.DeleteConfig(ctx, i.cli.Client, rsc.ID()); err != nil {
log.Warnf("Failed to roll back created reactor config: %v", err)
}

if err := common.WaitUntilResourceCondition(ctx, i.cli.Client, rsc.IQN.WWN(), common.NoResourcesInUse); err != nil {
log.Warnf("Failed to wait for resource to become unused: %v", err)
}
}
}()

_, err = i.Start(ctx, rsc.IQN)
if err != nil {
return nil, fmt.Errorf("failed to start resources: %w", err)
Expand Down Expand Up @@ -193,6 +207,11 @@ func (i *ISCSI) Start(ctx context.Context, iqn Iqn) (*ResourceConfig, error) {
return nil, fmt.Errorf("error waiting for resource to become used: %w", err)
}

err = common.AssertResourceInUseStable(waitCtx, i.cli.Client, iqn.WWN())
if err != nil {
return nil, fmt.Errorf("error waiting for resource to become stable: %w", err)
}

return i.Get(ctx, iqn)
}

Expand Down
19 changes: 19 additions & 0 deletions pkg/nfs/nfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,20 @@ func (n *NFS) Create(ctx context.Context, rsc *ResourceConfig) (*ResourceConfig,
return nil, fmt.Errorf("failed to register reactor config file: %w", err)
}

defer func() {
// if we fail beyond this point, delete the just created reactor config
if err != nil {
log.Debugf("Rollback: deleting just created reactor config %s", rsc.ID())
if err := reactor.DeleteConfig(ctx, n.cli.Client, rsc.ID()); err != nil {
log.Warnf("Failed to roll back created reactor config: %v", err)
}

if err := common.WaitUntilResourceCondition(ctx, n.cli.Client, rsc.Name, common.NoResourcesInUse); err != nil {
log.Warnf("Failed to wait for resource to become unused: %v", err)
}
}
}()

_, err = n.Start(ctx, rsc.Name)
if err != nil {
return nil, fmt.Errorf("failed to start resources: %w", err)
Expand Down Expand Up @@ -227,6 +241,11 @@ func (n *NFS) Start(ctx context.Context, name string) (*ResourceConfig, error) {
return nil, fmt.Errorf("error waiting for resource to become used: %w", err)
}

err = common.AssertResourceInUseStable(waitCtx, n.cli.Client, name)
if err != nil {
return nil, fmt.Errorf("error waiting for resource to become stable: %w", err)
}

return n.Get(ctx, name)
}

Expand Down
18 changes: 18 additions & 0 deletions pkg/nvmeof/nvmeof.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,20 @@ func (n *NVMeoF) Create(ctx context.Context, rsc *ResourceConfig) (*ResourceConf
return nil, fmt.Errorf("failed to register reactor config file: %w", err)
}

defer func() {
// if we fail beyond this point, delete the just created reactor config
if err != nil {
log.Debugf("Rollback: deleting just created reactor config %s", rsc.ID())
if err := reactor.DeleteConfig(ctx, n.cli.Client, rsc.ID()); err != nil {
log.Warnf("Failed to roll back created reactor config: %v", err)
}

if err := common.WaitUntilResourceCondition(ctx, n.cli.Client, rsc.NQN.Subsystem(), common.NoResourcesInUse); err != nil {
log.Warnf("Failed to wait for resource to become unused: %v", err)
}
}
}()

_, err = n.Start(ctx, rsc.NQN)
if err != nil {
return nil, fmt.Errorf("failed to start resources: %w", err)
Expand Down Expand Up @@ -200,6 +214,10 @@ func (n *NVMeoF) Start(ctx context.Context, nqn Nqn) (*ResourceConfig, error) {
return nil, fmt.Errorf("error waiting for resource to become used: %w", err)
}

err = common.AssertResourceInUseStable(waitCtx, n.cli.Client, nqn.Subsystem())
if err != nil {
return nil, fmt.Errorf("error waiting for resource to become stable: %w", err)
}
return n.Get(ctx, nqn)
}

Expand Down

0 comments on commit 0a59e3e

Please sign in to comment.