New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better task balancing #1482

Merged
merged 73 commits into from Jun 8, 2017
Commits
Jump to file or symbol
Failed to load files and symbols.
+104 鈭77
Diff settings

Always

Just for now

Viewing a subset of changes. View all

calculate max possible task past due time based on task lag

  • Loading branch information...
darcatron committed May 31, 2017
commit 3345776757e6ebd57419a67760309dba49039777
View
@@ -4147,7 +4147,7 @@ Start a new deployment for a Request
| avgStatusUpdateDelayMs | long | optional | |
| deadRacks | int | optional | |
| pendingRequests | int | optional | |
| maxTaskLag | long | optional | |
| maxTaskLagMillis | long | optional | |
| cooldownRequests | int | optional | |
| hostStates | [Array[SingularityHostState]](#model-SingularityHostState) | optional | |
| allRequests | int | optional | |
@@ -991,7 +991,7 @@ Models:
| avgStatusUpdateDelayMs | long | optional | |
| deadRacks | int | optional | |
| pendingRequests | int | optional | |
| maxTaskLag | long | optional | |
| maxTaskLagMillis | long | optional | |
| cooldownRequests | int | optional | |
| hostStates | [Array[SingularityHostState]](models.md#model-SingularityHostState) | optional | |
| allRequests | int | optional | |
@@ -197,7 +197,6 @@ These settings should live under the "mesos" field inside the root configuration
#### Offers ####
| Parameter | Default | Description | Type |
|-----------|---------|-------------|------|
| maxMillisPastDuePerTask | 300000 (5 min) | The max milliseconds a task can be past due when scoring an offer | long |
| longRunningUsedCpuWeightForOffer | 0.30 | The weight long running tasks' cpu utilization carries when scoring an offer (should add up to 1 with longRunningUsedMemWeightForOffer) | double |
| longRunningUsedMemWeightForOffer | 0.70 | The weight long running tasks' memory utilization carries when scoring an offer (should add up to 1 with longRunningUsedCpuWeightForOffer) | double |
| freeCpuWeightForOffer | 0.30 | The weight the slave's free cpu carries when scoring an offer (should add up to 1 with freeMemWeightForOffer) | double |
@@ -24,7 +24,7 @@ public int getNumFutureTasks() {
return numFutureTasks;
}
public long getMaxTaskLag() {
public long getMaxTaskLagMillis() {
return maxTaskLag;
}
@@ -20,7 +20,7 @@
private final int lbCleanupTasks;
private final int lbCleanupRequests;
private final long maxTaskLag;
private final long maxTaskLagMillis;
private final int pendingRequests;
private final int cleaningRequests;
@@ -59,16 +59,16 @@
@JsonCreator
public SingularityState(@JsonProperty("activeTasks") int activeTasks, @JsonProperty("launchingTasks") int launchingTasks, @JsonProperty("activeRequests") int activeRequests, @JsonProperty("cooldownRequests") int cooldownRequests,
@JsonProperty("pausedRequests") int pausedRequests, @JsonProperty("scheduledTasks") int scheduledTasks, @JsonProperty("pendingRequests") int pendingRequests, @JsonProperty("lbCleanupTasks") int lbCleanupTasks,
@JsonProperty("lbCleanupRequests") int lbCleanupRequests, @JsonProperty("cleaningRequests") int cleaningRequests, @JsonProperty("activeSlaves") int activeSlaves, @JsonProperty("deadSlaves") int deadSlaves,
@JsonProperty("decommissioningSlaves") int decommissioningSlaves, @JsonProperty("activeRacks") int activeRacks, @JsonProperty("deadRacks") int deadRacks, @JsonProperty("decommissioningRacks") int decommissioningRacks,
@JsonProperty("cleaningTasks") int cleaningTasks, @JsonProperty("hostStates") List<SingularityHostState> hostStates, @JsonProperty("oldestDeploy") long oldestDeploy, @JsonProperty("numDeploys") int numDeploys,
@JsonProperty("oldestDeployStep") long oldestDeployStep, @JsonProperty("activeDeploys") List<SingularityDeployMarker> activeDeploys,
@JsonProperty("lateTasks") int lateTasks, @JsonProperty("futureTasks") int futureTasks, @JsonProperty("maxTaskLag") long maxTaskLag, @JsonProperty("generatedAt") long generatedAt,
@JsonProperty("overProvisionedRequestIds") List<String> overProvisionedRequestIds, @JsonProperty("underProvisionedRequestIds") List<String> underProvisionedRequestIds,
@JsonProperty("overProvisionedRequests") int overProvisionedRequests, @JsonProperty("underProvisionedRequests") int underProvisionedRequests, @JsonProperty("finishedRequests") int finishedRequests,
@JsonProperty("unknownRacks") int unknownRacks, @JsonProperty("unknownSlaves") int unknownSlaves, @JsonProperty("authDatastoreHealthy") Optional<Boolean> authDatastoreHealthy, @JsonProperty("minimumPriorityLevel") Optional<Double> minimumPriorityLevel,
@JsonProperty("avgStatusUpdateDelayMs") long avgStatusUpdateDelayMs) {
@JsonProperty("pausedRequests") int pausedRequests, @JsonProperty("scheduledTasks") int scheduledTasks, @JsonProperty("pendingRequests") int pendingRequests, @JsonProperty("lbCleanupTasks") int lbCleanupTasks,
@JsonProperty("lbCleanupRequests") int lbCleanupRequests, @JsonProperty("cleaningRequests") int cleaningRequests, @JsonProperty("activeSlaves") int activeSlaves, @JsonProperty("deadSlaves") int deadSlaves,
@JsonProperty("decommissioningSlaves") int decommissioningSlaves, @JsonProperty("activeRacks") int activeRacks, @JsonProperty("deadRacks") int deadRacks, @JsonProperty("decommissioningRacks") int decommissioningRacks,
@JsonProperty("cleaningTasks") int cleaningTasks, @JsonProperty("hostStates") List<SingularityHostState> hostStates, @JsonProperty("oldestDeploy") long oldestDeploy, @JsonProperty("numDeploys") int numDeploys,
@JsonProperty("oldestDeployStep") long oldestDeployStep, @JsonProperty("activeDeploys") List<SingularityDeployMarker> activeDeploys,
@JsonProperty("lateTasks") int lateTasks, @JsonProperty("futureTasks") int futureTasks, @JsonProperty("maxTaskLagMillis") long maxTaskLagMillis, @JsonProperty("generatedAt") long generatedAt,
@JsonProperty("overProvisionedRequestIds") List<String> overProvisionedRequestIds, @JsonProperty("underProvisionedRequestIds") List<String> underProvisionedRequestIds,
@JsonProperty("overProvisionedRequests") int overProvisionedRequests, @JsonProperty("underProvisionedRequests") int underProvisionedRequests, @JsonProperty("finishedRequests") int finishedRequests,
@JsonProperty("unknownRacks") int unknownRacks, @JsonProperty("unknownSlaves") int unknownSlaves, @JsonProperty("authDatastoreHealthy") Optional<Boolean> authDatastoreHealthy, @JsonProperty("minimumPriorityLevel") Optional<Double> minimumPriorityLevel,
@JsonProperty("avgStatusUpdateDelayMs") long avgStatusUpdateDelayMs) {
this.activeTasks = activeTasks;
this.launchingTasks = launchingTasks;
this.activeRequests = activeRequests;
@@ -91,7 +91,7 @@ public SingularityState(@JsonProperty("activeTasks") int activeTasks, @JsonPrope
this.lateTasks = lateTasks;
this.finishedRequests = finishedRequests;
this.futureTasks = futureTasks;
this.maxTaskLag = maxTaskLag;
this.maxTaskLagMillis = maxTaskLagMillis;
this.oldestDeploy = oldestDeploy;
this.numDeploys = numDeploys;
this.oldestDeployStep = oldestDeployStep;
@@ -217,8 +217,8 @@ public int getFutureTasks() {
return futureTasks;
}
public long getMaxTaskLag() {
return maxTaskLag;
public long getMaxTaskLagMillis() {
return maxTaskLagMillis;
}
public int getLbCleanupTasks() {
@@ -279,7 +279,7 @@ public String toString() {
", cleaningTasks=" + cleaningTasks +
", lbCleanupTasks=" + lbCleanupTasks +
", lbCleanupRequests=" + lbCleanupRequests +
", maxTaskLag=" + maxTaskLag +
", maxTaskLagMillis=" + maxTaskLagMillis +
", pendingRequests=" + pendingRequests +
", cleaningRequests=" + cleaningRequests +
", finishedRequests=" + finishedRequests +
@@ -200,8 +200,6 @@
private int maxOfferAttemptsPerTask = 0;
private long maxMillisPastDuePerTask = TimeUnit.MINUTES.toMillis(5);
private double longRunningUsedCpuWeightForOffer = 0.30;
private double longRunningUsedMemWeightForOffer = 0.70;
@@ -694,10 +692,6 @@ public int getMaxOfferAttemptsPerTask() {
return maxOfferAttemptsPerTask;
}
public long getMaxMillisPastDuePerTask() {
return maxMillisPastDuePerTask;
}
public double getLongRunningUsedCpuWeightForOffer() {
return longRunningUsedCpuWeightForOffer;
}
@@ -1090,11 +1084,6 @@ public SingularityConfiguration setMaxOfferAttemptsPerTask(int maxOfferAttemptsP
return this;
}
public SingularityConfiguration setMaxMillisPastDuePerTask(long maxMillisPastDuePerTask) {
this.maxMillisPastDuePerTask = maxMillisPastDuePerTask;
return this;
}
public SingularityConfiguration setLongRunningUsedCpuWeightForOffer(double longRunningUsedCpuWeightForOffer) {
this.longRunningUsedCpuWeightForOffer = longRunningUsedCpuWeightForOffer;
return this;
@@ -287,7 +287,7 @@ public SingularityState generateState(boolean includeRequestIds) {
return new SingularityState(activeTasks, launchingTasks, numActiveRequests, cooldownRequests, numPausedRequests, scheduledTasks, pendingRequests, lbCleanupTasks, lbCleanupRequests, cleaningRequests, activeSlaves,
deadSlaves, decommissioningSlaves, activeRacks, deadRacks, decommissioningRacks, cleaningTasks, states, oldestDeploy, numDeploys, oldestDeployStep, activeDeploys, scheduledTasksInfo.getNumLateTasks(),
scheduledTasksInfo.getNumFutureTasks(), scheduledTasksInfo.getMaxTaskLag(), System.currentTimeMillis(), includeRequestIds ? overProvisionedRequestIds : null,
scheduledTasksInfo.getNumFutureTasks(), scheduledTasksInfo.getMaxTaskLagMillis(), System.currentTimeMillis(), includeRequestIds ? overProvisionedRequestIds : null,
includeRequestIds ? underProvisionedRequestIds : null, overProvisionedRequestIds.size(), underProvisionedRequestIds.size(), numFinishedRequests, unknownRacks, unknownSlaves, authDatastoreHealthy, minimumPriorityLevel,
statusUpdateDeltaAvg.get());
}
@@ -27,6 +27,7 @@
import com.hubspot.singularity.SingularityClusterUtilization;
import com.hubspot.singularity.SingularityDeployStatistics;
import com.hubspot.singularity.SingularityPendingTaskId;
import com.hubspot.singularity.SingularityScheduledTasksInfo;
import com.hubspot.singularity.SingularitySlaveUsage.ResourceUsageType;
import com.hubspot.singularity.SingularitySlaveUsageWithId;
import com.hubspot.singularity.SingularityTask;
@@ -130,12 +131,13 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration,
while (!pendingTaskIdToTaskRequest.isEmpty() && addedTaskInLastLoop && canScheduleAdditionalTasks(taskCredits)) {
addedTaskInLastLoop = false;
double maxTaskMillisPastDue = maxTaskMillisPastDue(SingularityScheduledTasksInfo.getInfo(taskManager.getPendingTasks(), configuration.getDeltaAfterWhichTasksAreLateMillis()).getMaxTaskLagMillis());

This comment has been minimized.

@darcatron

darcatron May 31, 2017

Contributor

I think we'll need a better name for this variable. It's the max possible task lag before we decide we're going to take any offer we can get. I feel maxTaskMillisPastDue has some confusing overlap with maxTaskLag; which is the current highest lag time for pending tasks

@darcatron

darcatron May 31, 2017

Contributor

I think we'll need a better name for this variable. It's the max possible task lag before we decide we're going to take any offer we can get. I feel maxTaskMillisPastDue has some confusing overlap with maxTaskLag; which is the current highest lag time for pending tasks

for (Iterator<SingularityTaskRequestHolder> iterator = pendingTaskIdToTaskRequest.values().iterator(); iterator.hasNext();) {
SingularityTaskRequestHolder taskRequestHolder = iterator.next();
Map<SingularityOfferHolder, Double> scorePerOffer = new HashMap<>();
double minScore = minScore(taskRequestHolder.getTaskRequest(), offerMatchAttemptsPerTask, usageManager.getClusterUtilization(), System.currentTimeMillis());
double minScore = minScore(taskRequestHolder.getTaskRequest(), offerMatchAttemptsPerTask, usageManager.getClusterUtilization(), System.currentTimeMillis(), maxTaskMillisPastDue);
LOG.trace("Minimum score {} for task {}", minScore, taskRequestHolder.getTaskRequest().getPendingTask().getPendingTaskId().getId());
@@ -152,7 +154,6 @@ public SingularityMesosOfferScheduler(MesosConfiguration mesosConfiguration,
MesosUtils.getMemory(offerHolder.getOffer()), MesosUtils.getNumCpus(offerHolder.getOffer()), offerHolder.getOffer().getHostname(), maybeSlaveUsage);
if (score != 0 && score >= minScore) {
// todo: can short circuit here if score is high enough (>= .9)
scorePerOffer.put(offerHolder, score);

This comment has been minimized.

@darcatron

darcatron Mar 30, 2017

Contributor

Thought we might want to have a value that's definitely good enough to just accept instead of continue evaluating

@darcatron

darcatron Mar 30, 2017

Contributor

Thought we might want to have a value that's definitely good enough to just accept instead of continue evaluating

}
}
@@ -358,7 +359,7 @@ private double calculateScore(double longRunningMemUsedScore, double memFreeScor
}
@VisibleForTesting
double minScore(SingularityTaskRequest taskRequest, Map<String, Integer> offerMatchAttemptsPerTask, Optional<SingularityClusterUtilization> maybeUtilization, long now) {
double minScore(SingularityTaskRequest taskRequest, Map<String, Integer> offerMatchAttemptsPerTask, Optional<SingularityClusterUtilization> maybeUtilization, long now, double maxTaskMillisPastDue) {
if (!maybeUtilization.isPresent()) {
return 0.00;
}
@@ -369,7 +370,7 @@ private double calculateScore(double longRunningMemUsedScore, double memFreeScor
double minScore = memScore + cpuScore - getScoreTolerance();
minScore -= offerMatchAttemptsPerTask.getOrDefault(taskRequest.getPendingTask().getPendingTaskId().getId(), 0) / getMaxOfferAttemptsPerTask();
minScore -= millisPastDue(taskRequest, now) / (double) configuration.getMaxMillisPastDuePerTask();
minScore -= millisPastDue(taskRequest, now) / maxTaskMillisPastDue;
return Math.max(minScore, 0);
}
@@ -391,6 +392,14 @@ private long millisPastDue(SingularityTaskRequest taskRequest, long now) {
return Math.max(now - taskRequest.getPendingTask().getPendingTaskId().getNextRunAt(), 0);
}
@VisibleForTesting
double maxTaskMillisPastDue(long lag) {
long maxMillisPastDueForNoLag = TimeUnit.MINUTES.toMillis(3);
double slope = -1 * maxMillisPastDueForNoLag / (double) configuration.getDeltaAfterWhichTasksAreLateMillis();
return Math.max((slope * lag) + maxMillisPastDueForNoLag, 1.00);
}
private SingularityTask acceptTask(SingularityOfferHolder offerHolder, SingularitySchedulerStateCache stateCache, Map<String, Map<String, Integer>> tasksPerOfferPerRequest, SingularityTaskRequestHolder taskRequestHolder) {
final SingularityTaskRequest taskRequest = taskRequestHolder.getTaskRequest();
final SingularityTask task = mesosTaskBuilder.buildTask(offerHolder.getOffer(), offerHolder.getCurrentResources(), taskRequest, taskRequestHolder.getTaskResources(), taskRequestHolder.getExecutorResources());
Oops, something went wrong.
ProTip! Use n and p to navigate between commits in a pull request.