Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task alerts #721

Merged
merged 18 commits into from Oct 15, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -149,4 +149,5 @@ public String getTaskS3LogOmitPrefix() {
public void setTaskS3LogOmitPrefix(String taskS3LogOmitPrefix) {
this.taskS3LogOmitPrefix = taskS3LogOmitPrefix;
}

}
Expand Up @@ -37,6 +37,8 @@ public class IndexView extends View {

private final String taskS3LogOmitPrefix;

private final Integer warnIfScheduledJobIsRunningPastNextRunPct;

public IndexView(String singularityUriBase, String appRoot, SingularityConfiguration configuration) {
super("index.mustache");

Expand Down Expand Up @@ -72,6 +74,8 @@ public IndexView(String singularityUriBase, String appRoot, SingularityConfigura
this.commonHostnameSuffixToOmit = configuration.getCommonHostnameSuffixToOmit().or("");

this.taskS3LogOmitPrefix = configuration.getUiConfiguration().getTaskS3LogOmitPrefix();

this.warnIfScheduledJobIsRunningPastNextRunPct = configuration.getWarnIfScheduledJobIsRunningPastNextRunPct();
}

public String getAppRoot() {
Expand Down Expand Up @@ -150,6 +154,10 @@ public String getTaskS3LogOmitPrefix() {
return taskS3LogOmitPrefix;
}

/**
 * Returns the value read from
 * {@code configuration.getWarnIfScheduledJobIsRunningPastNextRunPct()} when this view was constructed.
 * NOTE(review): presumably this getter is what the index template resolves
 * {@code warnIfScheduledJobIsRunningPastNextRunPct} against; the value may be null since it is a
 * boxed Integer -- confirm the template tolerates that.
 */
public Integer getWarnIfScheduledJobIsRunningPastNextRunPct() {
return warnIfScheduledJobIsRunningPastNextRunPct;
}

@Override
public String toString() {
return "IndexView[" +
Expand All @@ -171,6 +179,7 @@ public String toString() {
", runningTaskLogPath='" + runningTaskLogPath + '\'' +
", finishedTaskLogPath='" + finishedTaskLogPath + '\'' +
", commonHostnameSuffixToOmit='" + commonHostnameSuffixToOmit + '\'' +
", warnIfScheduledJobIsRunningPastNextRunPct='" + warnIfScheduledJobIsRunningPastNextRunPct + '\'' +
']';
}
}
1 change: 1 addition & 0 deletions SingularityUI/app/assets/_index.mustache
Expand Up @@ -36,6 +36,7 @@
commonHostnameSuffixToOmit: "{{{ commonHostnameSuffixToOmit }}}",
taskS3LogOmitPrefix: "{{{ taskS3LogOmitPrefix }}}",
slaveHttpPort: {{{slaveHttpPort}}},
warnIfScheduledJobIsRunningPastNextRunPct: {{{warnIfScheduledJobIsRunningPastNextRunPct}}},
{{#slaveHttpsPort}}
slaveHttpsPort: {{{slaveHttpsPort}}}
{{/slaveHttpsPort}}
Expand Down
8 changes: 8 additions & 0 deletions SingularityUI/app/collections/Alerts.coffee
@@ -0,0 +1,8 @@
Collection = require './collection'

# Collection holding the alert entries shown on the task detail page.
# It declares no url/model of its own here; TaskDetailController fills it by calling reset()
# with plain objects of the form {title, message, level}.
class Alerts extends Collection

# No setup is performed; the body is empty as written.
initialize: (models) =>


module.exports = Alerts
53 changes: 52 additions & 1 deletion SingularityUI/app/controllers/TaskDetail.coffee
Expand Up @@ -7,6 +7,8 @@ TaskS3Logs = require '../collections/TaskS3Logs'
TaskFiles = require '../collections/TaskFiles'
TaskCleanups = require '../collections/TaskCleanups'
Deploys = require '../collections/Deploys'
DeployDetails = require '../models/DeployDetails'
Alerts = require '../collections/Alerts'

FileBrowserSubview = require '../views/fileBrowserSubview'
ExpandableTableSubview = require '../views/expandableTableSubview'
Expand All @@ -28,6 +30,7 @@ class TaskDetailController extends Controller
info: require '../templates/taskDetail/taskInfo'
environment: require '../templates/taskDetail/taskEnvironment'
resourceUsage: require '../templates/taskDetail/taskResourceUsage'
alerts: require '../templates/alerts'

initialize: ({@taskId, @filePath}) ->
@title @taskId
Expand All @@ -50,6 +53,8 @@ class TaskDetailController extends Controller

@collections.pendingDeploys = new Deploys state: 'pending'

@collections.alerts = new Alerts

#
# Subviews
#
Expand Down Expand Up @@ -97,6 +102,10 @@ class TaskDetailController extends Controller
model: @models.resourceUsage
template: @templates.resourceUsage

@subviews.alerts = new SimpleSubview
collection: @collections.alerts
template: @templates.alerts

#
# Getting stuff in gear
#
Expand Down Expand Up @@ -125,6 +134,47 @@ class TaskDetailController extends Controller
app.caughtError()
delete @models.resourceUsage

getAlerts: =>
alerts = []
task = @models.task
requestId = @models.task.attributes.task.taskRequest.request.id
deployId = @models.task.attributes.task.taskRequest.deploy.id

# Is this a scheduled task that has been running much longer than previous ones?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of calculating this yourself, you should use averageRuntimeMillis from the SingularityDeployStatistics object, which can be fetched from the deploy history endpoint (.../api/history/request/REQUEST_ID/deploy/DEPLOY_ID). It would also be wise to surface warnIfScheduledJobIsRunningPastNextRunPct from https://github.com/HubSpot/Singularity/blob/master/SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java#L172 in the window.config object, so that we don't have to hardcode the threshold value.

# Only still-running SCHEDULED tasks are compared against the deploy's average runtime.
if task.attributes.task.taskRequest.request.requestType == 'SCHEDULED' and task.get('isStillRunning')
  deployInfo = new DeployDetails
    deployId: deployId
    requestId: requestId
  deployPromise = deployInfo.fetch()
  deployPromise.done =>
    avg = deployInfo.get('deployStatistics')?.averageRuntimeMillis
    # Without a positive average there is nothing meaningful to compare against:
    # an undefined average makes the comparison NaN, and a zero average would
    # raise the warning for every running task immediately.
    return unless avg? and avg > 0
    current = new Date().getTime() - task.get('task').taskId.startedAt
    # The configured value is a percentage; convert it to a multiplier (e.g. 200 -> 2).
    threshold = window.config.warnIfScheduledJobIsRunningPastNextRunPct / 100
    # Alert if current uptime is longer than the average * the configurable percentage
    if current > (avg * threshold)
      alerts.push
        title: 'Warning:',
        message: "This scheduled task has been running longer than <code>#{threshold}</code> times the average for the request and may be stuck.",
        level: 'warning'
# Was this task killed by a decommissioning slave?
# Finished tasks only: look for a TASK_CLEANING update that mentions decommissioning
# together with a TASK_KILLED update.
if !task.get('isStillRunning')
  # Fall back to an empty list so a task without updates does not throw on .filter.
  updates = task.get('taskUpdates') or []
  decomMessage = updates.filter (u) =>
    # statusMessage must be checked explicitly: `u.statusMessage?.indexOf(...)` evaluates to
    # undefined when the message is absent, and `undefined != -1` is true, which would
    # wrongly match every TASK_CLEANING update that has no message.
    # NOTE(review): 'DECOMISSIONING' spelling is kept as-is; it presumably mirrors the text
    # emitted by the service -- confirm before "correcting" it.
    return u.taskState == 'TASK_CLEANING' and u.statusMessage? and u.statusMessage.indexOf('DECOMISSIONING') != -1
  killedMessage = updates.filter (u) =>
    return u.taskState == 'TASK_KILLED'
  if decomMessage.length > 0 and killedMessage.length > 0
    alerts.push
      title: 'Alert:',
      message: 'This task was killed due to a slave decommissioning.',
      level: 'danger'

# Publish the collected alerts. When a deploy fetch is in flight, wait for it to settle so
# any alert pushed by its success handler is included.
if deployPromise
  # `always` (not `done`) so a failed fetch still resets the collection; otherwise alerts
  # from a previous refresh would stay on screen. Callbacks run in registration order, so
  # a success handler attached earlier has already pushed its alert by the time this runs.
  deployPromise.always =>
    @collections.alerts.reset(alerts)
else
  @collections.alerts.reset(alerts)

refresh: ->
@resourcesFetched = false

Expand All @@ -135,12 +185,13 @@ class TaskDetailController extends Controller
@models.task.fetch()
.done =>
@fetchResourceUsage() if @models.task.get('isStillRunning')
.success =>
@getAlerts()
.error =>
# If this 404s the task doesn't exist
app.caughtError()
app.router.notFound()


if @collections.s3Logs?.currentPage is 1
@collections.s3Logs.fetch().error =>
# It probably means S3 logs haven't been configured
Expand Down
5 changes: 5 additions & 0 deletions SingularityUI/app/templates/alerts.hbs
@@ -0,0 +1,5 @@
{{!--
  Renders one Bootstrap-style alert box per entry in `data`.
  Each entry supplies `level` (used as the alert-<level> class suffix), `title`, and `message`.
  `message` is emitted with triple braces, i.e. unescaped, so it may contain markup;
  callers must only pass trusted HTML there.
--}}
{{#each data}}
<div class="alert alert-{{level}}" role="alert">
<strong>{{title}}</strong> {{{message}}}
</div>
{{/each}}
22 changes: 13 additions & 9 deletions SingularityUI/app/templates/taskDetail/taskBase.hbs
Expand Up @@ -4,39 +4,43 @@
<div class="page-loader centered cushy"></div>
</div>

<div id="alerts" class='col-md-12'>

</div>

<div id="healthcheck-notification" class='col-md-12'>

</div>

<div id="history" class='col-md-12'>

</div>

<div id="file-browser" class='col-md-12'>

</div>

<div id='s3-logs' class='col-md-12'>

</div>

<div id='lb-updates' class='col-md-12'>

</div>

<div id='info' class='col-md-12'>

</div>

<div id='resources' class='col-md-12'>

</div>

<div id='environment' class='col-md-12'>

</div>

<div id='health-checks' class='col-md-12'>

</div>
</div>
1 change: 1 addition & 0 deletions SingularityUI/app/views/task.coffee
Expand Up @@ -18,6 +18,7 @@ class TaskView extends View

# Plop subview contents in there. It'll take care of everything itself
@$('#overview').html @subviews.overview.$el
@$('#alerts').html @subviews.alerts.$el
@$('#healthcheck-notification').html @subviews.healthcheckNotification.$el
@$('#history').html @subviews.history.$el
@$('#file-browser').html @subviews.fileBrowser.$el
Expand Down
3 changes: 2 additions & 1 deletion SingularityUI/config.coffee
Expand Up @@ -28,7 +28,7 @@ exports.config =

# When running SingularityUI via brunch server we need to make an index.html for it
# based on the template that's shared with SingularityService
#
#
# After we compile the static files, compile index.html using some required configs
onCompile: =>
destination = path.resolve @config.paths.public, 'index.html'
Expand All @@ -54,6 +54,7 @@ exports.config =
finishedTaskLogPath: process.env.SINGULARITY_FINISHED_TASK_LOG_PATH ? "stdout"
commonHostnameSuffixToOmit: process.env.SINGULARITY_COMMON_HOSTNAME_SUFFIX_TO_OMIT ? ""
taskS3LogOmitPrefix: process.env.SINGULARITY_TASK_S3_LOG_OMIT_PREFIX ? ''
warnIfScheduledJobIsRunningPastNextRunPct: process.env.SINGULARITY_WARN_IF_SCHEDULED_JOB_IS_RUNNING_PAST_NEXT_RUN_PCT ? 200

compiledTemplate = handlebars.compile(indexTemplate)(templateData)
fs.writeFileSync destination, compiledTemplate