From ecfc0472bc70d58618844c6caaf6da0e0b8a555f Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Tue, 23 Aug 2016 13:45:28 -0400 Subject: [PATCH] very basic detection --- .../SingularityDisabledAction.java | 14 +- .../SingularityDisabledActionType.java | 2 +- .../singularity/SingularityDisasterStats.java | 99 +++++++++++++ .../singularity/SingularityDisasterType.java | 5 + .../DisasterDetectionConfiguration.java | 103 +++++++++++++- ...ctionManager.java => DisasterManager.java} | 55 ++++++-- .../data/SingularityDataModule.java | 2 +- .../data/SingularityValidator.java | 10 +- .../SingularityTranscoderModule.java | 2 + ...nsResource.java => DisastersResource.java} | 27 ++-- .../resources/SingularityResourceModule.java | 2 +- .../SingularityDisasterDetectionPoller.java | 133 +++++++++++++++++- .../SingularityTaskReconciliationPoller.java | 17 ++- .../app/actions/api/disabledActions.es6 | 6 +- 14 files changed, 432 insertions(+), 45 deletions(-) create mode 100644 SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterStats.java create mode 100644 SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterType.java rename SingularityService/src/main/java/com/hubspot/singularity/data/{DisabledActionManager.java => DisasterManager.java} (52%) rename SingularityService/src/main/java/com/hubspot/singularity/resources/{DisabledActionsResource.java => DisastersResource.java} (68%) diff --git a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledAction.java b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledAction.java index 1bac47ab45..23633f0755 100644 --- a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledAction.java +++ b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledAction.java @@ -9,12 +9,14 @@ public class SingularityDisabledAction { private final SingularityDisabledActionType type; private final String message; private final Optional user; 
+ /* true when the disaster detection poller disabled the action; the REST resource passes false */ private final boolean systemGenerated; @JsonCreator - public SingularityDisabledAction(@JsonProperty("type") SingularityDisabledActionType type, @JsonProperty("message") String message, @JsonProperty("user") Optional user) { + public SingularityDisabledAction(@JsonProperty("type") SingularityDisabledActionType type, @JsonProperty("message") String message, @JsonProperty("user") Optional user, @JsonProperty("systemGenerated") boolean systemGenerated) { this.type = type; this.message = message; this.user = user; + this.systemGenerated = systemGenerated; } public SingularityDisabledActionType getType() { @@ -29,6 +31,10 @@ public Optional getUser() { return user; } + public boolean isSystemGenerated() { + return systemGenerated; + } + @Override public boolean equals(Object o) { if (this == o) { @@ -38,14 +44,15 @@ public boolean equals(Object o) { return false; } SingularityDisabledAction that = (SingularityDisabledAction) o; - return type == that.type && + return systemGenerated == that.systemGenerated && + type == that.type && Objects.equal(message, that.message) && Objects.equal(user, that.user); } @Override public int hashCode() { - return Objects.hashCode(type, message, user); + return Objects.hashCode(type, message, user, systemGenerated); } @Override @@ -54,6 +61,7 @@ public String toString() { .add("type", type) .add("message", message) .add("user", user) + .add("systemGenerated", systemGenerated) .toString(); } } diff --git a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledActionType.java b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledActionType.java index 4c5afb2d1c..41bb2d18f6 100644 --- a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledActionType.java +++ b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisabledActionType.java @@ -1,5 +1,5 @@ package com.hubspot.singularity; public enum SingularityDisabledActionType { - BOUNCE, DEPLOY, SCALE, REMOVE, 
DECOMMISSION + BOUNCE, DEPLOY, SCALE, REMOVE, DECOMMISSION, TASK_RECONCILIATION } diff --git a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterStats.java b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterStats.java new file mode 100644 index 0000000000..99d12c0301 --- /dev/null +++ b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterStats.java @@ -0,0 +1,99 @@ +package com.hubspot.singularity; + +import java.util.Collections; +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.Objects; + +/** Snapshot of task and slave counts recorded at one timestamp; all fields are final and it is built through the @JsonCreator constructor. */ public class SingularityDisasterStats { + private final long timestamp; + private final int numActiveTasks; + private final int numPendingTasks; + private final int numOverdueTasks; + private final long avgTaskLagMillis; + private final int numActiveSlaves; + private final int numLostSlaves; + + @JsonCreator + public SingularityDisasterStats(@JsonProperty("timestamp") long timestamp, + @JsonProperty("numActiveTasks") int numActiveTasks, + @JsonProperty("numPendingTasks") int numPendingTasks, + @JsonProperty("numOverdueTasks") int numOverdueTasks, + @JsonProperty("avgTaskLagMillis") long avgTaskLagMillis, + @JsonProperty("numActiveSlaves") int numActiveSlaves, + @JsonProperty("numLostSlaves") int numLostSlaves) { + this.timestamp = timestamp; + this.numActiveTasks = numActiveTasks; + this.numPendingTasks = numPendingTasks; + this.numOverdueTasks = numOverdueTasks; + this.avgTaskLagMillis = avgTaskLagMillis; + this.numActiveSlaves = numActiveSlaves; + this.numLostSlaves = numLostSlaves; + } + + public long getTimestamp() { + return timestamp; + } + + public int getNumActiveTasks() { + return numActiveTasks; + } + + public int getNumPendingTasks() { + return numPendingTasks; + } + + public int getNumOverdueTasks() { + return numOverdueTasks; + } + + public long getAvgTaskLagMillis() { + return 
avgTaskLagMillis; + } + + public int getNumActiveSlaves() { + return numActiveSlaves; + } + + public int getNumLostSlaves() { + return numLostSlaves; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SingularityDisasterStats that = (SingularityDisasterStats) o; + return timestamp == that.timestamp && + numActiveTasks == that.numActiveTasks && + numPendingTasks == that.numPendingTasks && + numOverdueTasks == that.numOverdueTasks && + avgTaskLagMillis == that.avgTaskLagMillis && + numActiveSlaves == that.numActiveSlaves && + numLostSlaves == that.numLostSlaves; + } + + @Override + public int hashCode() { + return Objects.hashCode(timestamp, numActiveTasks, numPendingTasks, numOverdueTasks, avgTaskLagMillis, numActiveSlaves, numLostSlaves); + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("timestamp", timestamp) + .add("numActiveTasks", numActiveTasks) + .add("numPendingTasks", numPendingTasks) + .add("numOverdueTasks", numOverdueTasks) + .add("avgTaskLagMillis", avgTaskLagMillis) + .add("numActiveSlaves", numActiveSlaves) + .add("numLostSlaves", numLostSlaves) + .toString(); + } +} diff --git a/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterType.java b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterType.java new file mode 100644 index 0000000000..b3303ca39c --- /dev/null +++ b/SingularityBase/src/main/java/com/hubspot/singularity/SingularityDisasterType.java @@ -0,0 +1,5 @@ +package com.hubspot.singularity; + +/** Disaster conditions the detection poller can report: excessive pending-task lag and lost slaves. */ public enum SingularityDisasterType { + EXCESSIVE_TASK_LAG, LOST_SLAVES; +} diff --git a/SingularityService/src/main/java/com/hubspot/singularity/config/DisasterDetectionConfiguration.java b/SingularityService/src/main/java/com/hubspot/singularity/config/DisasterDetectionConfiguration.java index b5c0babb4f..64ea7c2ace 100644 --- 
a/SingularityService/src/main/java/com/hubspot/singularity/config/DisasterDetectionConfiguration.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/config/DisasterDetectionConfiguration.java @@ -1,20 +1,49 @@ package com.hubspot.singularity.config; +import java.util.List; import java.util.concurrent.TimeUnit; import javax.validation.constraints.NotNull; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.hubspot.singularity.SingularityDisabledActionType; public class DisasterDetectionConfiguration { @JsonProperty private boolean enabled = false; + @JsonProperty + private long runEveryMillis = TimeUnit.SECONDS.toMillis(30); + + @JsonProperty + /* a pending task is counted as overdue once it is more than this many ms past its next run time */ private long considerOverdueAfterMillis = TimeUnit.MINUTES.toMillis(1); + @JsonProperty @NotNull - private long runEveryMillis = TimeUnit.SECONDS.toMillis(15); + /* actions the detection poller disables while any disaster is active */ private List disableActionsOnDisaster = ImmutableList.of(SingularityDisabledActionType.BOUNCE, SingularityDisabledActionType.DEPLOY, SingularityDisabledActionType.TASK_RECONCILIATION); + + @JsonProperty + private boolean checkOverdueTasks = true; + + @JsonProperty + /* average lag (ms) of past-due pending tasks above which task lag is critical */ private long criticalAvgTaskLagMillis = 300000L; + + @JsonProperty + /* threshold for overdue tasks as a portion of active + pending tasks */ private double criticalOverdueTaskPortion = 0.2; + @JsonProperty + /* true: both task-lag thresholds must be exceeded; false: either one is enough */ private boolean requireAllConditionsForOverdueTaskDisaster = true; + + @JsonProperty + private boolean checkLostSlaves = true; + + @JsonProperty + /* threshold for lost slaves as a portion of running + lost slaves */ private double criticalLostSlavePortion = 0.2; + + @JsonProperty + /* NOTE(review): presumably the look-back window (ms) for counting a slave as newly lost - confirm against the poller's comparison */ private long checkLostSlavesInLastMillis = 60000; public boolean isEnabled() { return enabled; @@ -31,4 +60,76 @@ public long getRunEveryMillis() { public void setRunEveryMillis(long runEveryMillis) { this.runEveryMillis = runEveryMillis; } + + public long getConsiderOverdueAfterMillis() { + return considerOverdueAfterMillis; + } + + public void setConsiderOverdueAfterMillis(long considerOverdueAfterMillis) { + this.considerOverdueAfterMillis = considerOverdueAfterMillis; + } + + 
public List getDisableActionsOnDisaster() { + return disableActionsOnDisaster; + } + + public void setDisableActionsOnDisaster(List disableActionsOnDisaster) { + this.disableActionsOnDisaster = disableActionsOnDisaster; + } + + public boolean isCheckOverdueTasks() { + return checkOverdueTasks; + } + + public void setCheckOverdueTasks(boolean checkOverdueTasks) { + this.checkOverdueTasks = checkOverdueTasks; + } + + public long getCriticalAvgTaskLagMillis() { + return criticalAvgTaskLagMillis; + } + + public void setCriticalAvgTaskLagMillis(long criticalAvgTaskLagMillis) { + this.criticalAvgTaskLagMillis = criticalAvgTaskLagMillis; + } + + public double getCriticalOverdueTaskPortion() { + return criticalOverdueTaskPortion; + } + + public void setCriticalOverdueTaskPortion(double criticalOverdueTaskPortion) { + this.criticalOverdueTaskPortion = criticalOverdueTaskPortion; + } + + public boolean isRequireAllConditionsForOverdueTaskDisaster() { + return requireAllConditionsForOverdueTaskDisaster; + } + + public void setRequireAllConditionsForOverdueTaskDisaster(boolean requireAllConditionsForOverdueTaskDisaster) { + this.requireAllConditionsForOverdueTaskDisaster = requireAllConditionsForOverdueTaskDisaster; + } + + public boolean isCheckLostSlaves() { + return checkLostSlaves; + } + + public void setCheckLostSlaves(boolean checkLostSlaves) { + this.checkLostSlaves = checkLostSlaves; + } + + public double getCriticalLostSlavePortion() { + return criticalLostSlavePortion; + } + + public void setCriticalLostSlavePortion(double criticalLostSlavePortion) { + this.criticalLostSlavePortion = criticalLostSlavePortion; + } + + public long getCheckLostSlavesInLastMillis() { + return checkLostSlavesInLastMillis; + } + + public void setCheckLostSlavesInLastMillis(long checkLostSlavesInLastMillis) { + this.checkLostSlavesInLastMillis = checkLostSlavesInLastMillis; + } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/DisabledActionManager.java 
b/SingularityService/src/main/java/com/hubspot/singularity/data/DisasterManager.java similarity index 52% rename from SingularityService/src/main/java/com/hubspot/singularity/data/DisabledActionManager.java rename to SingularityService/src/main/java/com/hubspot/singularity/data/DisasterManager.java index 1c08cd0cb2..4dcef58180 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/DisabledActionManager.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/DisasterManager.java @@ -2,6 +2,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.curator.framework.CuratorFramework; @@ -14,26 +15,34 @@ import com.hubspot.singularity.SingularityDeleteResult; import com.hubspot.singularity.SingularityDisabledAction; import com.hubspot.singularity.SingularityDisabledActionType; +import com.hubspot.singularity.SingularityDisasterStats; +import com.hubspot.singularity.SingularityDisasterType; import com.hubspot.singularity.SingularityUser; import com.hubspot.singularity.config.SingularityConfiguration; import com.hubspot.singularity.data.transcoders.Transcoder; -public class DisabledActionManager extends CuratorAsyncManager { - private static final String DISABLED_ACTIONS_ROOT = "/disabled-actions"; +public class DisasterManager extends CuratorAsyncManager { + private static final String DISASTERS_ROOT = "/disasters"; + private static final String DISABLED_ACTIONS = DISASTERS_ROOT + "/disabled-actions"; + private static final String ACTIVE_DISASTERS = DISASTERS_ROOT + "/active"; + private static final String DISASTER_STATS = DISASTERS_ROOT + "/stats"; private static final String MESSAGE_FORMAT = "Cannot perform action %s: %s"; private static final String DEFAULT_MESSAGE = "Action is currently disabled"; private final Transcoder disabledActionTranscoder; + private final Transcoder disasterStatsTranscoder; @Inject - public 
DisabledActionManager(CuratorFramework curator, SingularityConfiguration configuration, MetricRegistry metricRegistry, Transcoder disabledActionTranscoder) { + public DisasterManager(CuratorFramework curator, SingularityConfiguration configuration, MetricRegistry metricRegistry, + Transcoder disabledActionTranscoder, Transcoder disasterStatsTranscoder) { super(curator, configuration, metricRegistry); this.disabledActionTranscoder = disabledActionTranscoder; + this.disasterStatsTranscoder = disasterStatsTranscoder; } private String getActionPath(SingularityDisabledActionType action) { - return ZKPaths.makePath(DISABLED_ACTIONS_ROOT, action.name()); + return ZKPaths.makePath(DISABLED_ACTIONS, action.name()); } public boolean isDisabled(SingularityDisabledActionType action) { @@ -42,14 +51,15 @@ public boolean isDisabled(SingularityDisabledActionType action) { public SingularityDisabledAction getDisabledAction(SingularityDisabledActionType action) { Optional maybeDisabledAction = getData(getActionPath(action), disabledActionTranscoder); - return maybeDisabledAction.or(new SingularityDisabledAction(action, String.format(MESSAGE_FORMAT, action, DEFAULT_MESSAGE), Optional.absent())); + return maybeDisabledAction.or(new SingularityDisabledAction(action, String.format(MESSAGE_FORMAT, action, DEFAULT_MESSAGE), Optional.absent(), false)); } - public SingularityCreateResult disable(SingularityDisabledActionType action, Optional maybeMessage, Optional user) { + public SingularityCreateResult disable(SingularityDisabledActionType action, Optional maybeMessage, Optional user, boolean systemGenerated) { SingularityDisabledAction disabledAction = new SingularityDisabledAction( action, String.format(MESSAGE_FORMAT, action, maybeMessage.or(DEFAULT_MESSAGE)), - user.isPresent() ? Optional.of(user.get().getId()) : Optional.absent()); + user.isPresent() ? 
Optional.of(user.get().getId()) : Optional.absent(), + systemGenerated); return save(getActionPath(action), disabledAction, disabledActionTranscoder); } @@ -60,10 +70,35 @@ public SingularityDeleteResult enable(SingularityDisabledActionType action) { public List getDisabledActions() { List paths = new ArrayList<>(); - for (String path : getChildren(DISABLED_ACTIONS_ROOT)) { - paths.add(ZKPaths.makePath(DISABLED_ACTIONS_ROOT, path)); + for (String path : getChildren(DISABLED_ACTIONS)) { + paths.add(ZKPaths.makePath(DISABLED_ACTIONS, path)); } - return getAsync(DISABLED_ACTIONS_ROOT, paths, disabledActionTranscoder); + return getAsync(DISABLED_ACTIONS, paths, disabledActionTranscoder); + } + + /** Marks the disaster active by creating a node named after it under the active-disasters path. */ public void addDisaster(SingularityDisasterType disaster) { + create(ZKPaths.makePath(ACTIVE_DISASTERS, disaster.name())); + } + + /** Clears the disaster by deleting its node under the active-disasters path. */ public void removeDisaster(SingularityDisasterType disaster) { + delete(ZKPaths.makePath(ACTIVE_DISASTERS, disaster.name())); + } + + /** Lists active disasters by passing each child node name to SingularityDisasterType.valueOf. */ public List getActiveDisasters() { + List disasterNames = getChildren(ACTIVE_DISASTERS); + List disasters = new ArrayList<>(); + for (String name : disasterNames) { + disasters.add(SingularityDisasterType.valueOf(name)); + } + return disasters; + } + + /** Saves the stats at the single stats path using the stats transcoder. */ public void saveDisasterStats(SingularityDisasterStats stats) { + save(DISASTER_STATS, stats, disasterStatsTranscoder); + } + + /** Reads whatever is stored at the stats path through the stats transcoder. */ public Optional getDisasterStats() { + return getData(DISASTER_STATS, disasterStatsTranscoder); } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityDataModule.java b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityDataModule.java index a4ee3e3d46..e085e508c2 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityDataModule.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityDataModule.java @@ -28,7 +28,7 @@ protected void configure() { bind(ExecutorIdGenerator.class).in(Scopes.SINGLETON); 
bind(WebhookManager.class).in(Scopes.SINGLETON); - bind(DisabledActionManager.class).in(Scopes.SINGLETON); + bind(DisasterManager.class).in(Scopes.SINGLETON); } @Provides diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityValidator.java b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityValidator.java index 4a6043e3f4..6b6e59ffc3 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityValidator.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/SingularityValidator.java @@ -64,10 +64,10 @@ public class SingularityValidator { private final int deployIdLength; private final DeployHistoryHelper deployHistoryHelper; private final Resources defaultResources; - private final DisabledActionManager disabledActionManager; + private final DisasterManager disasterManager; @Inject - public SingularityValidator(SingularityConfiguration configuration, DeployHistoryHelper deployHistoryHelper, RequestManager requestManager, DisabledActionManager disabledActionManager) { + public SingularityValidator(SingularityConfiguration configuration, DeployHistoryHelper deployHistoryHelper, RequestManager requestManager, DisasterManager disasterManager) { this.maxDeployIdSize = configuration.getMaxDeployIdSize(); this.maxRequestIdSize = configuration.getMaxRequestIdSize(); this.allowRequestsWithoutOwners = configuration.isAllowRequestsWithoutOwners(); @@ -87,7 +87,7 @@ public SingularityValidator(SingularityConfiguration configuration, DeployHistor this.maxMemoryMbPerRequest = configuration.getMesosConfiguration().getMaxMemoryMbPerRequest(); this.maxInstancesPerRequest = configuration.getMesosConfiguration().getMaxNumInstancesPerRequest(); - this.disabledActionManager = disabledActionManager; + this.disasterManager = disasterManager; } private void checkForIllegalChanges(SingularityRequest request, SingularityRequest existingRequest) { @@ -433,8 +433,8 @@ private String 
getNewDayOfWeekValue(String schedule, int dayOfWeekValue) { } public void checkActionEnabled(SingularityDisabledActionType action) { - if (disabledActionManager.isDisabled(action)) { - locked(disabledActionManager.getDisabledAction(action).getMessage()); + if (disasterManager.isDisabled(action)) { + locked(disasterManager.getDisabledAction(action).getMessage()); } } diff --git a/SingularityService/src/main/java/com/hubspot/singularity/data/transcoders/SingularityTranscoderModule.java b/SingularityService/src/main/java/com/hubspot/singularity/data/transcoders/SingularityTranscoderModule.java index 51f2dd4b32..79f8d1482f 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/data/transcoders/SingularityTranscoderModule.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/data/transcoders/SingularityTranscoderModule.java @@ -12,6 +12,7 @@ import com.hubspot.singularity.SingularityDeployStatistics; import com.hubspot.singularity.SingularityDeployUpdate; import com.hubspot.singularity.SingularityDisabledAction; +import com.hubspot.singularity.SingularityDisasterStats; import com.hubspot.singularity.SingularityHostState; import com.hubspot.singularity.SingularityKilledTaskIdRecord; import com.hubspot.singularity.SingularityLoadBalancerUpdate; @@ -84,6 +85,7 @@ public void configure(final Binder binder) { bindTranscoder(binder).asJson(SingularityExpiringSkipHealthchecks.class); bindTranscoder(binder).asJson(SingularityTaskDestroyFrameworkMessage.class); bindTranscoder(binder).asJson(SingularityDisabledAction.class); + bindTranscoder(binder).asJson(SingularityDisasterStats.class); bindTranscoder(binder).asCompressedJson(SingularityDeployHistory.class); bindTranscoder(binder).asCompressedJson(SingularityDeploy.class); diff --git a/SingularityService/src/main/java/com/hubspot/singularity/resources/DisabledActionsResource.java b/SingularityService/src/main/java/com/hubspot/singularity/resources/DisastersResource.java similarity index 68% rename 
from SingularityService/src/main/java/com/hubspot/singularity/resources/DisabledActionsResource.java
rename to SingularityService/src/main/java/com/hubspot/singularity/resources/DisastersResource.java
index 0b9a3f100a..e1e61a92ff 100644
--- a/SingularityService/src/main/java/com/hubspot/singularity/resources/DisabledActionsResource.java
+++ b/SingularityService/src/main/java/com/hubspot/singularity/resources/DisastersResource.java
@@ -17,43 +17,45 @@
 import com.hubspot.singularity.SingularityService;
 import com.hubspot.singularity.SingularityUser;
 import com.hubspot.singularity.auth.SingularityAuthorizationHelper;
-import com.hubspot.singularity.data.DisabledActionManager;
+import com.hubspot.singularity.data.DisasterManager;
 import com.wordnik.swagger.annotations.Api;
 
-@Path(DisabledActionsResource.PATH)
+/** Admin-only endpoints for listing, disabling and re-enabling Singularity actions under the disasters API path. */
+@Path(DisastersResource.PATH)
 @Produces(MediaType.APPLICATION_JSON)
-@Api(description="Manages Singularity Deploys for existing requests", value=DisabledActionsResource.PATH)
-public class DisabledActionsResource {
-  public static final String PATH = SingularityService.API_BASE_PATH + "/disabled-actions";
+@Api(description="Manages Singularity disasters and disabled actions", value=DisastersResource.PATH)
+public class DisastersResource {
+  public static final String PATH = SingularityService.API_BASE_PATH + "/disasters";
 
-  private final DisabledActionManager disabledActionManager;
+  private final DisasterManager disasterManager;
   private final SingularityAuthorizationHelper authorizationHelper;
   private final Optional<SingularityUser> user;
 
   @Inject
-  public DisabledActionsResource(DisabledActionManager disabledActionManager, SingularityAuthorizationHelper authorizationHelper, Optional<SingularityUser> user) {
-    this.disabledActionManager = disabledActionManager;
+  public DisastersResource(DisasterManager disasterManager, SingularityAuthorizationHelper authorizationHelper, Optional<SingularityUser> user) {
+    this.disasterManager = disasterManager;
     this.authorizationHelper = authorizationHelper;
     this.user = user;
   }
 
   @GET
+  @Path("/disabled-actions")
   public List<SingularityDisabledAction> disabledActions() {
     authorizationHelper.checkAdminAuthorization(user);
-    return disabledActionManager.getDisabledActions();
+    return disasterManager.getDisabledActions();
   }
 
   @POST
-  @Path("/{action}")
+  @Path("/disabled-actions/{action}")
   public void disableAction(@PathParam("action") SingularityDisabledActionType action, String message) {
     authorizationHelper.checkAdminAuthorization(user);
-    disabledActionManager.disable(action, Optional.fromNullable(message), user);
+    disasterManager.disable(action, Optional.fromNullable(message), user, false);
   }
 
   @DELETE
-  @Path("/{action}")
+  @Path("/disabled-actions/{action}")
   public void enableAction(@PathParam("action") SingularityDisabledActionType action) {
     authorizationHelper.checkAdminAuthorization(user);
-    disabledActionManager.enable(action);
+    disasterManager.enable(action);
   }
 }
diff --git a/SingularityService/src/main/java/com/hubspot/singularity/resources/SingularityResourceModule.java b/SingularityService/src/main/java/com/hubspot/singularity/resources/SingularityResourceModule.java
index cf6a058590..bfdcb5f653 100644
--- a/SingularityService/src/main/java/com/hubspot/singularity/resources/SingularityResourceModule.java
+++ b/SingularityService/src/main/java/com/hubspot/singularity/resources/SingularityResourceModule.java
@@ -38,7 +38,7 @@ protected void configure() {
     bind(WebhookResource.class);
     bind(AuthResource.class);
     bind(MetricsResource.class);
-    bind(DisabledActionsResource.class);
+    bind(DisastersResource.class);
 
     switch (uiConfiguration.getRootUrlMode()) {
       case UI_REDIRECT: {
diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDisasterDetectionPoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDisasterDetectionPoller.java
index ac73e21873..52a2fdc127 100644
--- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDisasterDetectionPoller.java
+++ 
b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDisasterDetectionPoller.java
@@ -1,12 +1,25 @@
 package com.hubspot.singularity.scheduler;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.TimeUnit;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Optional;
 import com.google.inject.Inject;
+import com.hubspot.singularity.MachineState;
+import com.hubspot.singularity.SingularityDisabledAction;
+import com.hubspot.singularity.SingularityDisabledActionType;
+import com.hubspot.singularity.SingularityDisasterStats;
+import com.hubspot.singularity.SingularityDisasterType;
+import com.hubspot.singularity.SingularityPendingTask;
+import com.hubspot.singularity.SingularitySlave;
+import com.hubspot.singularity.SingularityUser;
+import com.hubspot.singularity.config.DisasterDetectionConfiguration;
 import com.hubspot.singularity.config.SingularityConfiguration;
+import com.hubspot.singularity.data.DisasterManager;
 import com.hubspot.singularity.data.SlaveManager;
 import com.hubspot.singularity.data.TaskManager;
 
@@ -14,21 +27,23 @@ public class SingularityDisasterDetectionPoller extends SingularityLeaderOnlyPol
 
   private static final Logger LOG = LoggerFactory.getLogger(SingularityDisasterDetectionPoller.class);
 
-  private final SingularityConfiguration configuration;
+  private final DisasterDetectionConfiguration configuration;
   private final TaskManager taskManager;
   private final SlaveManager slaveManager;
+  private final DisasterManager disasterManager;
 
   @Inject
-  public SingularityDisasterDetectionPoller(SingularityConfiguration configuration, TaskManager taskManager, SlaveManager slaveManager) {
+  public SingularityDisasterDetectionPoller(SingularityConfiguration configuration, TaskManager taskManager, SlaveManager slaveManager, DisasterManager disasterManager) {
     super(configuration.getDisasterDetection().getRunEveryMillis(), TimeUnit.MILLISECONDS);
-    this.configuration = configuration;
+    this.configuration = configuration.getDisasterDetection();
     this.taskManager = taskManager;
     this.slaveManager = slaveManager;
+    this.disasterManager = disasterManager;
   }
 
   @Override
   protected boolean isEnabled() {
-    return configuration.getDisasterDetection().isEnabled();
+    return configuration.isEnabled();
   }
 
   @Override
@@ -38,8 +53,132 @@ protected boolean abortsOnError() {
 
   @Override
   public void runActionOnPoll() {
-
+    List<SingularityDisasterType> previouslyActiveDisasters = disasterManager.getActiveDisasters();
+    Optional<SingularityDisasterStats> lastStats = disasterManager.getDisasterStats();
+    SingularityDisasterStats newStats = collectNewStats(lastStats);
+    updateActiveDisastersAndDisabledActions(previouslyActiveDisasters, checkStats(lastStats, newStats));
+    disasterManager.saveDisasterStats(newStats);
   }
 
+  /**
+   * Gathers the task and slave counts that the disaster checks are evaluated against.
+   */
+  private SingularityDisasterStats collectNewStats(Optional<SingularityDisasterStats> lastStats) {
+    long now = System.currentTimeMillis();
+
+    // Pending and active task data
+    int numActiveTasks = taskManager.getNumActiveTasks();
+    List<SingularityPendingTask> pendingTasks = taskManager.getPendingTasks();
+    int numPendingTasks = pendingTasks.size();
+    int numOverdueTasks = 0;
+    long totalTaskLagMillis = 0;
+    int numPastDueTasks = 0;
+
+    for (SingularityPendingTask pendingTask : pendingTasks) {
+      long taskLagMillis = now - pendingTask.getPendingTaskId().getNextRunAt();
+      if (taskLagMillis > configuration.getConsiderOverdueAfterMillis()) {
+        numOverdueTasks++;
+      }
+      if (taskLagMillis > 0) {
+        numPastDueTasks++;
+        totalTaskLagMillis += taskLagMillis;
+      }
+    }
+
+    // No past-due tasks means no lag; guard the division so an idle cluster cannot throw ArithmeticException
+    long avgTaskLagMillis = numPastDueTasks == 0 ? 0 : totalTaskLagMillis / numPastDueTasks;
+
+    // Active/Inactive slave data
+    List<SingularitySlave> slaves = slaveManager.getObjects();
+    int numRunningSlaves = 0;
+    int numLostSlaves = 0;
+    for (SingularitySlave slave : slaves) {
+      MachineState state = slave.getCurrentState().getState();
+      if (state != MachineState.DEAD && state != MachineState.MISSING_ON_STARTUP) {
+        numRunningSlaves++;
+      } else if (now - slave.getCurrentState().getTimestamp() < configuration.getCheckLostSlavesInLastMillis()) {
+        // Count only slaves whose state timestamp falls inside the window (assumes it marks when the state was entered - confirm)
+        numLostSlaves++;
+      }
+    }
+
+    return new SingularityDisasterStats(now, numActiveTasks, numPendingTasks, numOverdueTasks, avgTaskLagMillis, numRunningSlaves, numLostSlaves);
+  }
+
+  /**
+   * Removes stored disasters that are no longer detected and records the detected ones, then disables the
+   * configured actions while any disaster is active or re-enables system-disabled actions once none remain.
+   */
+  private void updateActiveDisastersAndDisabledActions(List<SingularityDisasterType> previouslyActiveDisasters, List<SingularityDisasterType> newActiveDisasters) {
+    for (SingularityDisasterType disaster : previouslyActiveDisasters) {
+      if (!newActiveDisasters.contains(disaster)) {
+        disasterManager.removeDisaster(disaster);
+      }
+    }
+
+    for (SingularityDisasterType disaster : newActiveDisasters) {
+      disasterManager.addDisaster(disaster);
+    }
+
+    if (!newActiveDisasters.isEmpty()) {
+      addDisabledActions(newActiveDisasters);
+    } else {
+      clearSystemGeneratedDisabledActions();
+    }
+  }
+
+  /** Disables every action listed in disableActionsOnDisaster, flagged as system generated. */
+  private void addDisabledActions(List<SingularityDisasterType> newActiveDisasters) {
+    String message = String.format("Active disasters detected: (%s)", newActiveDisasters);
+    for (SingularityDisabledActionType action : configuration.getDisableActionsOnDisaster()) {
+      disasterManager.disable(action, Optional.of(message), Optional.<SingularityUser>absent(), true);
+    }
+  }
+
+  /** Re-enables only disabled actions flagged as system generated; other disabled actions are left alone. */
+  private void clearSystemGeneratedDisabledActions() {
+    for (SingularityDisabledAction disabledAction : disasterManager.getDisabledActions()) {
+      if (disabledAction.isSystemGenerated()) {
+        disasterManager.enable(disabledAction.getType());
+      }
+    }
+  }
+
+  /** Returns the disaster types whose enabled checks are tripped by newStats. */
+  private List<SingularityDisasterType> checkStats(Optional<SingularityDisasterStats> lastStats, SingularityDisasterStats newStats) {
+    List<SingularityDisasterType> activeDisasters = new ArrayList<>();
+
+    if (configuration.isCheckOverdueTasks() && tooMuchTaskLag(lastStats, newStats)) {
+      activeDisasters.add(SingularityDisasterType.EXCESSIVE_TASK_LAG);
+    }
+    if (configuration.isCheckLostSlaves() && tooManyLostSlaves(lastStats, newStats)) {
+      activeDisasters.add(SingularityDisasterType.LOST_SLAVES);
+    }
+
+    return activeDisasters;
+  }
+
+  /** True when the overdue-task portion and/or the average task lag exceed their configured critical values. */
+  private boolean tooMuchTaskLag(Optional<SingularityDisasterStats> lastStats, SingularityDisasterStats newStats) {
+    int numTasks = newStats.getNumActiveTasks() + newStats.getNumPendingTasks();
+    // Divide as doubles: int division truncates every ratio below 1 to 0, and a cluster with no tasks would divide by zero
+    double overdueTaskPortion = numTasks == 0 ? 0.0 : (double) newStats.getNumOverdueTasks() / numTasks;
+    boolean criticalOverdueTasksPortion = overdueTaskPortion > configuration.getCriticalOverdueTaskPortion();
+    boolean criticalAvgTaskLag = newStats.getAvgTaskLagMillis() > configuration.getCriticalAvgTaskLagMillis();
+
+    if (configuration.isRequireAllConditionsForOverdueTaskDisaster()) {
+      return criticalAvgTaskLag && criticalOverdueTasksPortion;
+    } else {
+      return criticalAvgTaskLag || criticalOverdueTasksPortion;
+    }
+  }
+
+  /** True when lost slaves make up more than the configured critical portion of running plus lost slaves. */
+  private boolean tooManyLostSlaves(Optional<SingularityDisasterStats> lastStats, SingularityDisasterStats newStats) {
+    int numSlaves = newStats.getNumActiveSlaves() + newStats.getNumLostSlaves();
+    // Same double division and zero-denominator guard as the task lag check
+    double lostSlavesPortion = numSlaves == 0 ? 0.0 : (double) newStats.getNumLostSlaves() / numSlaves;
+    return lostSlavesPortion > configuration.getCriticalLostSlavePortion();
+  }
 
 }
diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityTaskReconciliationPoller.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityTaskReconciliationPoller.java
index b97064d710..0a550e9b8b 100644
--- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityTaskReconciliationPoller.java
+++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityTaskReconciliationPoller.java
@@ -4,24 +4,37 @@
 
 import javax.inject.Singleton;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.google.inject.Inject;
+import com.hubspot.singularity.SingularityDisabledActionType;
 import com.hubspot.singularity.config.SingularityConfiguration;
+import com.hubspot.singularity.data.DisasterManager;
 
 @Singleton
 public class SingularityTaskReconciliationPoller extends SingularityLeaderOnlyPoller {
 
+  private static final Logger LOG = LoggerFactory.getLogger(SingularityTaskReconciliationPoller.class);
+
   private final SingularityTaskReconciliation taskReconciliation;
+  private final DisasterManager disasterManager;
 
   @Inject
-  SingularityTaskReconciliationPoller(SingularityConfiguration configuration, SingularityTaskReconciliation taskReconciliation) {
+  SingularityTaskReconciliationPoller(SingularityConfiguration configuration, SingularityTaskReconciliation taskReconciliation, DisasterManager disasterManager) {
     super(configuration.getStartNewReconcileEverySeconds(), TimeUnit.SECONDS);
 
     this.taskReconciliation = taskReconciliation;
+    this.disasterManager = disasterManager;
   }
 
   @Override
   public void runActionOnPoll() {
-    taskReconciliation.startReconciliation();
+    if (disasterManager.isDisabled(SingularityDisabledActionType.TASK_RECONCILIATION)) {
+      LOG.warn("Not starting implicit task reconciliation: {}", disasterManager.getDisabledAction(SingularityDisabledActionType.TASK_RECONCILIATION).getMessage());
+    } else {
+      taskReconciliation.startReconciliation();
+    }
   }
 
 }
diff --git a/SingularityUI/app/actions/api/disabledActions.es6 b/SingularityUI/app/actions/api/disabledActions.es6
index a096651a2b..4f4cdee4b4 100644
--- a/SingularityUI/app/actions/api/disabledActions.es6
+++ b/SingularityUI/app/actions/api/disabledActions.es6
@@ -2,14 +2,14 @@ import { buildApiAction, buildJsonApiAction } from './base';
 
 export const FetchDisabledActions = buildApiAction(
   'FETCH_DISABLED_ACTIONS',
-  {url: '/disabled-actions'}
+  {url: '/disasters/disabled-actions'}
 );
 
 export const DeleteDisabledAction = buildJsonApiAction(
   'DELETE_DISABLED_ACTION',
   'DELETE',
   (type) => ({
-    url: `/disabled-actions/${ type }`
+    url: `/disasters/disabled-actions/${ type }`
   })
 );
 
@@ -17,7 +17,7 @@ export const NewDisabledAction = buildJsonApiAction(
   'NEW_DISABLED_ACTION',
   'POST',
   (type, message) => ({
-    url: `/disabled-actions/${ type }`,
+    url: `/disasters/disabled-actions/${ type }`,
     body: message
   })
 );