Skip to content

Commit

Permalink
Fixes in email, Job.GetApplications, Job cleanup and archiving and Jo…
Browse files Browse the repository at this point in the history
…b submission with same user/group. (#402)

* GetApplications for a job returns an empty list instead of NotFoundException. Emails are more detailed.Archival and Cleanup of job directories only happens if the job runs. Fixed an edge case where job submission failed if user/group specified were the same in the jobrequest.Emails upon job completion are more detailed.

Added start and end time to genie log file for debugging.

* Code Review improvements.
  • Loading branch information
amitsharmaak authored and tgianos committed Sep 28, 2016
1 parent 7a087e7 commit e1205f5
Show file tree
Hide file tree
Showing 8 changed files with 67 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ public final class JobConstants {
.append("}\n")
.append("\n")
.append("SELF_PID=$$\n\n")
.append("echo \"Start run execution\"\n")
.append("echo Start: `date '+%Y-%m-%d %H:%M:%S'`\n")
.toString();

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,15 +203,20 @@ protected synchronized void createUser(final String user, final String group) th
} catch (final IOException ioe) {
log.debug("User does not exist. Creating it now.");

// Create the group for the user.
if (group != null) {
// Determine if the group is valid by checking that its not null and not same as user.
final boolean isGroupValid = StringUtils.isNotBlank(group) && !group.equals(user);

// Create the group for the user if its not the same as the user.
if (isGroupValid) {
log.debug("Group and User are different so creating group now.");
final CommandLine groupCreateCommandLine = new CommandLine("sudo");
groupCreateCommandLine.addArgument("groupadd");
groupCreateCommandLine.addArgument(group);

// We create the group and ignore the error as it will fail if group already exists.
// If the failure is due to some other reason, then user creation will fail and we catch that.
try {
log.debug("Running command to create group: [" + groupCreateCommandLine.toString() + "]");
this.executor.execute(groupCreateCommandLine);
} catch (IOException ioexception) {
log.debug("Group creation threw an error as it might already exist");
Expand All @@ -222,14 +227,15 @@ protected synchronized void createUser(final String user, final String group) th
userCreateCommandLine.addArgument("useradd");
userCreateCommandLine.addArgument(user);

if (StringUtils.isNotBlank(group)) {
if (isGroupValid) {
userCreateCommandLine.addArgument("-G");
userCreateCommandLine.addArgument(group);
}

userCreateCommandLine.addArgument("-M");

try {
log.debug("Running command to create user: [" + userCreateCommandLine.toString() + "]");
this.executor.execute(userCreateCommandLine);
} catch (IOException ioexception) {
throw new GenieServerException("Could not create user " + user + " with exception " + ioexception);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ public void executeTask(@NotNull final Map<String, Object> context) throws Genie
writer.write(JobConstants.GENIE_DONE_FILE_CONTENT_PREFIX
+ JobConstants.GENIE_DONE_FILE_NAME
+ System.lineSeparator());

// Print the timestamp once its done running.
writer.write("echo End: `date '+%Y-%m-%d %H:%M:%S'`\n");

log.info("Finished Job Task for job {}", jobExecEnv.getJobRequest().getId());
} finally {
final long finish = System.nanoTime();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import javax.persistence.criteria.Root;
import javax.validation.constraints.NotNull;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Set;
Expand Down Expand Up @@ -359,7 +360,7 @@ public List<Application> getJobApplications(@NotBlank final String id) throws Ge
if (applications != null && !applications.isEmpty()) {
return applications.stream().map(ApplicationEntity::getDTO).collect(Collectors.toList());
} else {
throw new GenieNotFoundException("Job " + id + " doesn't have a cluster associated with it");
return Collections.EMPTY_LIST;
}
} else {
throw new GenieNotFoundException("No job with id " + id + " exists. Unable to get cluster");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,27 +191,27 @@ public void cantGetJobApplicationsIfJobDoesNotExist() throws GenieException {
*
* @throws GenieException For any problem
*/
@Test(expected = GenieNotFoundException.class)
@Test()
public void cantGetJobApplicationsIfApplicationsDoNotExist() throws GenieException {
final String id = UUID.randomUUID().toString();
final JobEntity entity = Mockito.mock(JobEntity.class);
Mockito.when(entity.getApplications()).thenReturn(null);
Mockito.when(this.jobRepository.findOne(id)).thenReturn(entity);
this.service.getJobApplications(id);
Assert.assertTrue(this.service.getJobApplications(id).isEmpty());
}

/**
* Test the getJobApplications method.
*
* @throws GenieException For any problem
*/
@Test(expected = GenieNotFoundException.class)
@Test()
public void cantGetJobApplicationsIfApplicationsAreEmpty() throws GenieException {
final String id = UUID.randomUUID().toString();
final JobEntity entity = Mockito.mock(JobEntity.class);
Mockito.when(entity.getApplications()).thenReturn(Lists.newArrayList());
Mockito.when(this.jobRepository.findOne(id)).thenReturn(entity);
this.service.getJobApplications(id);
Assert.assertTrue(this.service.getJobApplications(id).isEmpty());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,29 +268,31 @@ private Void updateJob(final Job job, final JobFinishedEvent event, final Map<St

/**
* An external fail-safe mechanism to clean up processes left behind by the run.sh after the
* job is killed or failed.
* job is killed or failed. This method is a no-op for jobs whose status is INVALID.
*
* @param jobId The id of the job to cleanup processes for.
*/
private void cleanupProcesses(final String jobId) {
try {
this.jobSearchService.getJobExecution(jobId).getProcessId().ifPresent(pid -> {
try {
final CommandLine commandLine = new CommandLine(JobConstants.UNIX_PKILL_COMMAND);
commandLine.addArgument(JobConstants.getKillFlag());
commandLine.addArgument(Integer.toString(pid));
executor.execute(commandLine);

// The process group should not exist and the above code should always throw and exception.
// If it does not then the bash script is not cleaning up stuff well during kills
// or the script is done but child processes are still remaining. This metric tracks all that.
processGroupCleanupFailureRate.increment();
} catch (final Exception e) {
log.debug("Received expected exception. Ignoring.");
}
});
if (!this.jobSearchService.getJobStatus(jobId).equals(JobStatus.INVALID)) {
this.jobSearchService.getJobExecution(jobId).getProcessId().ifPresent(pid -> {
try {
final CommandLine commandLine = new CommandLine(JobConstants.UNIX_PKILL_COMMAND);
commandLine.addArgument(JobConstants.getKillFlag());
commandLine.addArgument(Integer.toString(pid));
executor.execute(commandLine);

// The process group should not exist and the above code should always throw and exception.
// If it does not then the bash script is not cleaning up stuff well during kills
// or the script is done but child processes are still remaining. This metric tracks all that.
processGroupCleanupFailureRate.increment();
} catch (final Exception e) {
log.debug("Received expected exception. Ignoring.");
}
});
}
} catch (final GenieException ge) {
log.error("Unable to get job execution so unable to cleanup process for job " + jobId, ge);
log.error("Unable to cleanup process for job due to exception. " + jobId, ge);
this.processGroupCleanupFailureRate.increment();
}
}
Expand Down Expand Up @@ -431,7 +433,9 @@ private boolean processJobDir(final Job job) throws GenieException, IOException
log.debug("Got a job finished event. Will process job directory.");
boolean result = false;
final Optional<String> oJobId = job.getId();
if (oJobId.isPresent()) {

// The deletion of dependencies and archiving only happens for job requests which are not Invalid.
if (oJobId.isPresent() && !(this.jobSearchService.getJobStatus(job.getId().get()).equals(JobStatus.INVALID))) {
final String jobId = oJobId.get();
final File jobDir = new File(this.baseWorkingDir, jobId);

Expand Down Expand Up @@ -493,13 +497,32 @@ private boolean sendEmail(final String jobId) throws GenieException {
final JobRequest jobRequest = this.jobSearchService.getJobRequest(jobId);
boolean result = false;
final Optional<String> email = jobRequest.getEmail();

if (email.isPresent() && !Strings.isNullOrEmpty(email.get())) {
log.debug("Got a job finished event. Sending email: {}", email.get());
final JobStatus status = this.jobSearchService.getJobStatus(jobId);

final StringBuilder subject = new StringBuilder()
.append("Genie Job Finished. Id: [")
.append(jobId)
.append("], Name: [")
.append(jobRequest.getName())
.append("], Status: [")
.append(status)
.append("].");

final StringBuilder body = new StringBuilder()
.append("Id: [" + jobId + "]\n")
.append("Name: [" + jobRequest.getName() + "]\n")
.append("Status: [" + status + "]\n")
.append("User: [" + jobRequest.getUser() + "]\n")
.append("Description: [" + jobRequest.getDescription() + "]\n")
.append("Tags: " + jobRequest.getTags() + "\n");

this.mailServiceImpl.sendEmail(
email.get(),
"Genie Job" + jobId,
"Job with id [" + jobId + "] finished with status " + status
subject.toString(),
body.toString()
);
result = true;
this.emailSuccessRate.increment();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function handle_kill_request {

SELF_PID=$$

echo "Start run execution"
echo Start: `date '+%Y-%m-%d %H:%M:%S'`

export GENIE_JOB_DIR="TEST_GENIE_JOB_WORKING_DIR_PLACEHOLDER"

Expand Down Expand Up @@ -112,3 +112,4 @@ wait $!

# Write the return code from the command in the done file.
printf '{"exitCode": "%s"}\n' "$?" > ./genie/genie.done
echo End: `date '+%Y-%m-%d %H:%M:%S'`
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function handle_kill_request {

SELF_PID=$$

echo "Start run execution"
echo Start: `date '+%Y-%m-%d %H:%M:%S'`

export GENIE_JOB_DIR="TEST_GENIE_JOB_WORKING_DIR_PLACEHOLDER"

Expand Down Expand Up @@ -112,3 +112,4 @@ wait $!

# Write the return code from the command in the done file.
printf '{"exitCode": "%s"}\n' "$?" > ./genie/genie.done
echo End: `date '+%Y-%m-%d %H:%M:%S'`

0 comments on commit e1205f5

Please sign in to comment.