Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add optional: max retries for HC, timeout for HCs #604

Merged
merged 4 commits into from Aug 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -46,6 +46,9 @@ public class SingularityDeploy {
private final Optional<Boolean> skipHealthchecksOnDeploy;
private final Optional<HealthcheckProtocol> healthcheckProtocol;

private final Optional<Integer> healthcheckMaxRetries;
private final Optional<Long> healthcheckMaxTotalTimeoutSeconds;

private final Optional<Long> deployHealthTimeoutSeconds;

private final Optional<Long> considerHealthyAfterRunningForSeconds;
Expand Down Expand Up @@ -80,6 +83,8 @@ public SingularityDeploy(@JsonProperty("requestId") String requestId,
@JsonProperty("healthcheckUri") Optional<String> healthcheckUri,
@JsonProperty("healthcheckIntervalSeconds") Optional<Long> healthcheckIntervalSeconds,
@JsonProperty("healthcheckTimeoutSeconds") Optional<Long> healthcheckTimeoutSeconds,
@JsonProperty("healthcheckMaxRetries") Optional<Integer> healthcheckMaxRetries,
@JsonProperty("healthcheckMaxTotalTimeoutSeconds") Optional<Long> healthcheckMaxTotalTimeoutSeconds,
@JsonProperty("serviceBasePath") Optional<String> serviceBasePath,
@JsonProperty("loadBalancerGroups") Optional<List<String>> loadBalancerGroups,
@JsonProperty("considerHealthyAfterRunningForSeconds") Optional<Long> considerHealthyAfterRunningForSeconds,
Expand Down Expand Up @@ -114,6 +119,9 @@ public SingularityDeploy(@JsonProperty("requestId") String requestId,
this.skipHealthchecksOnDeploy = skipHealthchecksOnDeploy;
this.healthcheckProtocol = healthcheckProtocol;

this.healthcheckMaxRetries = healthcheckMaxRetries;
this.healthcheckMaxTotalTimeoutSeconds = healthcheckMaxTotalTimeoutSeconds;

this.considerHealthyAfterRunningForSeconds = considerHealthyAfterRunningForSeconds;

this.deployHealthTimeoutSeconds = deployHealthTimeoutSeconds;
Expand All @@ -139,6 +147,9 @@ public SingularityDeployBuilder toBuilder() {
.setSkipHealthchecksOnDeploy(skipHealthchecksOnDeploy)
.setHealthcheckProtocol(healthcheckProtocol)

.setHealthcheckMaxRetries(healthcheckMaxRetries)
.setHealthcheckMaxTotalTimeoutSeconds(healthcheckMaxTotalTimeoutSeconds)

.setConsiderHealthyAfterRunningForSeconds(considerHealthyAfterRunningForSeconds)
.setDeployHealthTimeoutSeconds(deployHealthTimeoutSeconds)
.setServiceBasePath(serviceBasePath)
Expand All @@ -153,7 +164,7 @@ public SingularityDeployBuilder toBuilder() {
.setExecutorData(executorData);
}

@ApiModelProperty(required=false, value="Number of seconds that singularity waits for this service to become healthy.")
@ApiModelProperty(required=false, value="Number of seconds that Singularity waits for this service to become healthy (for it to download artifacts, start running, and optionally pass healthchecks.)")
public Optional<Long> getDeployHealthTimeoutSeconds() {
return deployHealthTimeoutSeconds;
}
Expand Down Expand Up @@ -241,22 +252,22 @@ public Optional<ExecutorData> getExecutorData() {
return executorData;
}

@ApiModelProperty(required=false, value="Deployment Healthcheck URI.")
@ApiModelProperty(required=false, value="Deployment Healthcheck URI, if specified will be called after TASK_RUNNING.")
public Optional<String> getHealthcheckUri() {
return healthcheckUri;
}

@ApiModelProperty(required=false, value="Healthcheck protocol")
@ApiModelProperty(required=false, value="Healthcheck protocol - HTTP or HTTPS")
public Optional<HealthcheckProtocol> getHealthcheckProtocol() {
return healthcheckProtocol;
}

@ApiModelProperty(required=false, value="Health check interval in seconds.")
@ApiModelProperty(required=false, value="Time to wait after a failed healthcheck to try again in seconds.")
public Optional<Long> getHealthcheckIntervalSeconds() {
return healthcheckIntervalSeconds;
}

@ApiModelProperty(required=false, value="Health check timeout in seconds.")
@ApiModelProperty(required=false, value="Single healthcheck HTTP timeout in seconds.")
public Optional<Long> getHealthcheckTimeoutSeconds() {
return healthcheckTimeoutSeconds;
}
Expand Down Expand Up @@ -286,36 +297,50 @@ public Optional<Boolean> getSkipHealthchecksOnDeploy() {
return skipHealthchecksOnDeploy;
}

/**
 * Returns the optional maximum number of healthcheck retries configured for this deploy.
 *
 * @return the configured retry limit as supplied to the constructor; may be absent.
 *         NOTE(review): an absent value presumably means no retry limit is enforced —
 *         confirm against the deploy health evaluation code.
 */
@ApiModelProperty(required=false, value="Maximum number of times to retry an individual healthcheck before failing the deploy.")
public Optional<Integer> getHealthcheckMaxRetries() {
return healthcheckMaxRetries;
}

/**
 * Returns the optional overall healthcheck timeout, in seconds, configured for this deploy.
 *
 * @return the configured total timeout in seconds as supplied to the constructor; may be absent.
 *         NOTE(review): the reference point the timeout is measured from is not defined here —
 *         confirm against the deploy health evaluation code.
 */
@ApiModelProperty(required=false, value="Maximum amount of time to wait before failing a deploy for healthchecks to pass.")
public Optional<Long> getHealthcheckMaxTotalTimeoutSeconds() {
return healthcheckMaxTotalTimeoutSeconds;
}

@Override
public String toString() {
return "SingularityDeploy [" +
"requestId='" + requestId + '\'' +
", id='" + id + '\'' +
", version=" + version +
", timestamp=" + timestamp +
", metadata=" + metadata +
", containerInfo=" + containerInfo +
", customExecutorCmd=" + customExecutorCmd +
", customExecutorId=" + customExecutorId +
", customExecutorSource=" + customExecutorSource +
", customExecutorResources=" + customExecutorResources +
", customExecutorUser=" + customExecutorUser +
", resources=" + resources +
", command=" + command +
", arguments=" + arguments +
", env=" + env +
", uris=" + uris +
", executorData=" + executorData +
", healthcheckUri=" + healthcheckUri +
", healthcheckIntervalSeconds=" + healthcheckIntervalSeconds +
", healthcheckTimeoutSeconds=" + healthcheckTimeoutSeconds +
", skipHealthchecksOnDeploy=" + skipHealthchecksOnDeploy +
", deployHealthTimeoutSeconds=" + deployHealthTimeoutSeconds +
", considerHealthyAfterRunningForSeconds=" + considerHealthyAfterRunningForSeconds +
", healthcheckProtocol=" + healthcheckProtocol +
", serviceBasePath=" + serviceBasePath +
", loadBalancerGroups=" + loadBalancerGroups +
", loadBalancerOptions=" + loadBalancerOptions +
']';
return "SingularityDeploy{" +
"requestId='" + requestId + '\'' +
", id='" + id + '\'' +
", version=" + version +
", timestamp=" + timestamp +
", metadata=" + metadata +
", containerInfo=" + containerInfo +
", customExecutorCmd=" + customExecutorCmd +
", customExecutorId=" + customExecutorId +
", customExecutorSource=" + customExecutorSource +
", customExecutorResources=" + customExecutorResources +
", customExecutorUser=" + customExecutorUser +
", resources=" + resources +
", command=" + command +
", arguments=" + arguments +
", env=" + env +
", uris=" + uris +
", executorData=" + executorData +
", healthcheckUri=" + healthcheckUri +
", healthcheckIntervalSeconds=" + healthcheckIntervalSeconds +
", healthcheckTimeoutSeconds=" + healthcheckTimeoutSeconds +
", skipHealthchecksOnDeploy=" + skipHealthchecksOnDeploy +
", healthcheckProtocol=" + healthcheckProtocol +
", healthcheckMaxRetries=" + healthcheckMaxRetries +
", healthcheckMaxTotalTimeoutSeconds=" + healthcheckMaxTotalTimeoutSeconds +
", deployHealthTimeoutSeconds=" + deployHealthTimeoutSeconds +
", considerHealthyAfterRunningForSeconds=" + considerHealthyAfterRunningForSeconds +
", serviceBasePath=" + serviceBasePath +
", loadBalancerGroups=" + loadBalancerGroups +
", loadBalancerOptions=" + loadBalancerOptions +
'}';
}


}
Expand Up @@ -40,6 +40,9 @@ public class SingularityDeployBuilder {
private Optional<Boolean> skipHealthchecksOnDeploy;
private Optional<HealthcheckProtocol> healthcheckProtocol;

private Optional<Integer> healthcheckMaxRetries;
private Optional<Long> healthcheckMaxTotalTimeoutSeconds;

private Optional<Long> deployHealthTimeoutSeconds;

private Optional<Long> considerHealthyAfterRunningForSeconds;
Expand Down Expand Up @@ -72,15 +75,18 @@ public SingularityDeployBuilder(String requestId, String id) {
this.skipHealthchecksOnDeploy = Optional.absent();
this.deployHealthTimeoutSeconds = Optional.absent();
this.healthcheckProtocol = Optional.absent();
this.healthcheckMaxTotalTimeoutSeconds = Optional.absent();
this.healthcheckMaxRetries = Optional.absent();
this.considerHealthyAfterRunningForSeconds = Optional.absent();
this.serviceBasePath = Optional.absent();
this.loadBalancerGroups = Optional.absent();
this.loadBalancerOptions = Optional.absent();
}

public SingularityDeploy build() {
return new SingularityDeploy(requestId, id, command, arguments, containerInfo, customExecutorCmd, customExecutorId, customExecutorSource, customExecutorResources, customExecutorUser, resources, env, uris, metadata, executorData, version, timestamp, deployHealthTimeoutSeconds, healthcheckUri, healthcheckIntervalSeconds,
healthcheckTimeoutSeconds, serviceBasePath, loadBalancerGroups, considerHealthyAfterRunningForSeconds, loadBalancerOptions, skipHealthchecksOnDeploy, healthcheckProtocol);
return new SingularityDeploy(requestId, id, command, arguments, containerInfo, customExecutorCmd, customExecutorId, customExecutorSource, customExecutorResources, customExecutorUser, resources, env,
uris, metadata, executorData, version, timestamp, deployHealthTimeoutSeconds, healthcheckUri, healthcheckIntervalSeconds, healthcheckTimeoutSeconds, healthcheckMaxRetries,
healthcheckMaxTotalTimeoutSeconds, serviceBasePath, loadBalancerGroups, considerHealthyAfterRunningForSeconds, loadBalancerOptions, skipHealthchecksOnDeploy, healthcheckProtocol);
}

public String getRequestId() {
Expand Down Expand Up @@ -321,36 +327,57 @@ public SingularityDeployBuilder setHealthcheckProtocol(Optional<HealthcheckProto
return this;
}

/**
 * Returns the healthcheck retry limit currently held by this builder.
 *
 * @return the value last assigned to the builder's {@code healthcheckMaxRetries} field
 */
public Optional<Integer> getHealthcheckMaxRetries() {
return healthcheckMaxRetries;
}

/**
 * Returns the total healthcheck timeout (seconds) currently held by this builder.
 *
 * @return the value last assigned to the builder's {@code healthcheckMaxTotalTimeoutSeconds} field
 */
public Optional<Long> getHealthcheckMaxTotalTimeoutSeconds() {
return healthcheckMaxTotalTimeoutSeconds;
}

/**
 * Sets the healthcheck retry limit on this builder.
 *
 * @param healthcheckMaxRetries the retry limit to store; stored as given, with no validation here
 * @return this builder, so calls can be chained
 */
public SingularityDeployBuilder setHealthcheckMaxRetries(Optional<Integer> healthcheckMaxRetries) {
this.healthcheckMaxRetries = healthcheckMaxRetries;
return this;
}

/**
 * Sets the total healthcheck timeout (seconds) on this builder.
 *
 * @param healthcheckMaxTotalTimeoutSeconds the timeout to store; stored as given, with no validation here
 * @return this builder, so calls can be chained
 */
public SingularityDeployBuilder setHealthcheckMaxTotalTimeoutSeconds(Optional<Long> healthcheckMaxTotalTimeoutSeconds) {
this.healthcheckMaxTotalTimeoutSeconds = healthcheckMaxTotalTimeoutSeconds;
return this;
}

@Override
public String toString() {
return "SingularityDeployBuilder [" +
"requestId='" + requestId + '\'' +
", id='" + id + '\'' +
", version=" + version +
", timestamp=" + timestamp +
", metadata=" + metadata +
", containerInfo=" + containerInfo +
", customExecutorCmd=" + customExecutorCmd +
", customExecutorId=" + customExecutorId +
", customExecutorSource=" + customExecutorSource +
", customExecutorResources=" + customExecutorResources +
", customExecutorUser=" + customExecutorUser +
", resources=" + resources +
", command=" + command +
", arguments=" + arguments +
", env=" + env +
", uris=" + uris +
", executorData=" + executorData +
", healthcheckUri=" + healthcheckUri +
", healthcheckIntervalSeconds=" + healthcheckIntervalSeconds +
", healthcheckTimeoutSeconds=" + healthcheckTimeoutSeconds +
", skipHealthchecksOnDeploy=" + skipHealthchecksOnDeploy +
", deployHealthTimeoutSeconds=" + deployHealthTimeoutSeconds +
", considerHealthyAfterRunningForSeconds=" + considerHealthyAfterRunningForSeconds +
", healthcheckProtocol=" + healthcheckProtocol +
", serviceBasePath=" + serviceBasePath +
", loadBalancerGroups=" + loadBalancerGroups +
", loadBalancerOptions=" + loadBalancerOptions +
']';
return "SingularityDeployBuilder{" +
"requestId='" + requestId + '\'' +
", id='" + id + '\'' +
", version=" + version +
", timestamp=" + timestamp +
", metadata=" + metadata +
", containerInfo=" + containerInfo +
", customExecutorCmd=" + customExecutorCmd +
", customExecutorId=" + customExecutorId +
", customExecutorSource=" + customExecutorSource +
", customExecutorResources=" + customExecutorResources +
", customExecutorUser=" + customExecutorUser +
", resources=" + resources +
", command=" + command +
", arguments=" + arguments +
", env=" + env +
", uris=" + uris +
", executorData=" + executorData +
", healthcheckUri=" + healthcheckUri +
", healthcheckIntervalSeconds=" + healthcheckIntervalSeconds +
", healthcheckTimeoutSeconds=" + healthcheckTimeoutSeconds +
", skipHealthchecksOnDeploy=" + skipHealthchecksOnDeploy +
", healthcheckProtocol=" + healthcheckProtocol +
", healthcheckMaxRetries=" + healthcheckMaxRetries +
", healthcheckMaxTotalTimeoutSeconds=" + healthcheckMaxTotalTimeoutSeconds +
", deployHealthTimeoutSeconds=" + deployHealthTimeoutSeconds +
", considerHealthyAfterRunningForSeconds=" + considerHealthyAfterRunningForSeconds +
", serviceBasePath=" + serviceBasePath +
", loadBalancerGroups=" + loadBalancerGroups +
", loadBalancerOptions=" + loadBalancerOptions +
'}';
}

}
Expand Up @@ -319,6 +319,10 @@ public Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> getTaskHistory
return map;
}

/**
 * Counts the entries stored under the healthcheck parent path of the given task.
 *
 * @param taskId the task whose healthcheck parent path is inspected
 * @return the child count reported by {@code getNumChildren} for that path.
 *         NOTE(review): presumably one child per recorded healthcheck result (passed or failed),
 *         and presumably 0 when the path does not exist — confirm against the storage layer.
 */
public int getNumHealthchecks(SingularityTaskId taskId) {
return getNumChildren(getHealthcheckParentPath(taskId));
}

public List<SingularityTaskHealthcheckResult> getHealthcheckResults(SingularityTaskId taskId) {
List<SingularityTaskHealthcheckResult> healthcheckResults = getAsyncChildren(getHealthcheckParentPath(taskId), healthcheckResultTranscoder);
Collections.sort(healthcheckResults);
Expand Down
Expand Up @@ -44,7 +44,7 @@ public DeployHealth getDeployHealth(final Optional<SingularityDeploy> deploy, fi
if (!deploy.isPresent() || !deploy.get().getHealthcheckUri().isPresent() || (isDeployPending && deploy.get().getSkipHealthchecksOnDeploy().or(false))) {
return getNoHealthcheckDeployHealth(deploy, activeTasks);
} else {
return getHealthCheckDeployState(activeTasks);
return getHealthcheckDeployState(deploy.get(), activeTasks, isDeployPending);
}
}

Expand Down Expand Up @@ -86,17 +86,46 @@ private DeployHealth getNoHealthcheckDeployHealth(final Optional<SingularityDepl
return DeployHealth.HEALTHY;
}

private DeployHealth getHealthCheckDeployState(final Collection<SingularityTaskId> matchingActiveTasks) {
private DeployHealth getHealthcheckDeployState(final SingularityDeploy deploy, final Collection<SingularityTaskId> matchingActiveTasks, final boolean isDeployPending) {
Map<SingularityTaskId, SingularityTaskHealthcheckResult> healthcheckResults = taskManager.getLastHealthcheck(matchingActiveTasks);

for (SingularityTaskId taskId : matchingActiveTasks) {
SingularityTaskHealthcheckResult healthcheckResult = healthcheckResults.get(taskId);

if (healthcheckResult == null) {
LOG.debug("No health check present for {}", taskId);
LOG.debug("No healthcheck present for {}", taskId);
return DeployHealth.WAITING;
} else if (healthcheckResult.isFailed()) {
LOG.debug("Found a failed health check: {}", healthcheckResult);
LOG.debug("Found a failed healthcheck: {}", healthcheckResult);

if (deploy.getHealthcheckMaxRetries().isPresent() && taskManager.getNumHealthchecks(taskId) > deploy.getHealthcheckMaxRetries().get()) {
LOG.debug("{} failed {} healthchecks, the max for the deploy", taskId, deploy.getHealthcheckMaxRetries().get());
return DeployHealth.UNHEALTHY;
}

if (isDeployPending && deploy.getHealthcheckMaxTotalTimeoutSeconds().isPresent()) {
Collection<SingularityTaskHistoryUpdate> updates = taskManager.getTaskHistoryUpdates(taskId);

long runningAt = 0;

for (SingularityTaskHistoryUpdate update : updates) {
if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) {
runningAt = update.getTimestamp();
break;
}
}

if (runningAt > 0) {
final long durationSinceRunning = System.currentTimeMillis() - runningAt;

if (durationSinceRunning > TimeUnit.SECONDS.toMillis(deploy.getHealthcheckMaxTotalTimeoutSeconds().get())) {
LOG.debug("{} has been running for {} and has yet to pass healthchecks, failing deploy", taskId, JavaUtils.durationFromMillis(durationSinceRunning));

return DeployHealth.UNHEALTHY;
}
}
}

return DeployHealth.WAITING;
}
}
Expand Down
Expand Up @@ -5,7 +5,6 @@

import com.google.common.base.Optional;
import com.google.common.collect.ImmutableMap;
import com.hubspot.singularity.SingularityAbort;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHealthcheckResult;
import com.hubspot.singularity.config.SingularityConfiguration;
Expand All @@ -24,15 +23,14 @@ public class SingularityHealthcheckAsyncHandler extends AsyncCompletionHandler<R
private final SingularityNewTaskChecker newTaskChecker;
private final SingularityTask task;
private final TaskManager taskManager;
private final SingularityAbort abort;
private final int maxHealthcheckResponseBodyBytes;

public SingularityHealthcheckAsyncHandler(SingularityExceptionNotifier exceptionNotifier, SingularityConfiguration configuration, SingularityHealthchecker healthchecker, SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityAbort abort, SingularityTask task) {
public SingularityHealthcheckAsyncHandler(SingularityExceptionNotifier exceptionNotifier, SingularityConfiguration configuration, SingularityHealthchecker healthchecker,
SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityTask task) {
this.exceptionNotifier = exceptionNotifier;
this.taskManager = taskManager;
this.newTaskChecker = newTaskChecker;
this.healthchecker = healthchecker;
this.abort = abort;
this.task = task;
this.maxHealthcheckResponseBodyBytes = configuration.getMaxHealthcheckResponseBodyBytes();

Expand Down