Skip to content

Commit

Permalink
Merge pull request #1309 from teharrison/develop
Browse files Browse the repository at this point in the history
repo cleanup, bug fixes, function lookup
  • Loading branch information
teharrison committed Dec 20, 2017
2 parents 6b681ce + 8626fba commit 2830180
Show file tree
Hide file tree
Showing 9 changed files with 263 additions and 533 deletions.
175 changes: 0 additions & 175 deletions src/MGRAST/bin/clean_taxonomy.py

This file was deleted.

65 changes: 65 additions & 0 deletions src/MGRAST/bin/delete_job_and_files.pl
@@ -0,0 +1,65 @@
#!/usr/bin/env perl

###### cleanup tool ######
# Given an AWE job ID:
# 1. delete the job from AWE
# 2. delete all of the job's output files (nodes) in Shock
#
# Exits 1 on usage error or if the job document cannot be retrieved
# (without it we cannot discover which Shock nodes to delete).

use lib "/MG-RAST/conf";

use strict;
use warnings;
no warnings('once');

use Conf;
use JSON;
use LWP::UserAgent;
use HTTP::Request::Common qw(DELETE);

my $shock_url  = $Conf::shock_url;
my $awe_url    = $Conf::awe_url;
my $auth_token = $Conf::pipeline_token;

my $aweid = shift @ARGV;

unless ($aweid) {
    print STDERR "Usage: \tdelete_job_and_files.pl <awe_id> \n";
    exit 1;
}

# set handles
my $agent = LWP::UserAgent->new();
$agent->timeout(3600);
my $json = JSON->new;
$json = $json->utf8();
$json->max_size(0);
$json->allow_nonref;

# get job document first - we need the task/output list before deleting the job
my $response = undef;
my $job_doc  = undef;
eval {
    # BUG FIX: original called $self->agent / $self->json, but this is a
    # plain script with no $self object - use the lexical handles directly.
    my $get = $agent->get($awe_url.'/job/'.$aweid, 'Authorization', $auth_token);
    $response = $json->decode( $get->content );
};
if ($@ || (! ref($response))) {
    print STDERR "ERROR: unable to connect to AWE server\n";
    # BUG FIX: original fell through and later dereferenced undef $job_doc
    exit 1;
} elsif (exists($response->{error}) && $response->{error}) {
    print STDERR "ERROR: ".$response->{error}[0]."\n";
    exit 1;
} else {
    $job_doc = $response->{data};
}

# delete job (?full=1 also purges job history on the AWE side)
# SECURITY FIX: use in-process LWP DELETE instead of shelling out to curl
# with the auth token interpolated into a shell command line.
my $jdel = $agent->request(DELETE($awe_url.'/job/'.$aweid.'?full=1', 'Authorization', $auth_token));
print $jdel->content."\n";

# delete output nodes in shock; '-' marks a placeholder / absent node
foreach my $task (@{$job_doc->{tasks} || []}) {
    foreach my $out (@{$task->{outputs} || []}) {
        if ($out->{node} && ($out->{node} ne '-')) {
            my $ndel = $agent->request(DELETE($shock_url.'/node/'.$out->{node}, 'Authorization', $auth_token));
            print $ndel->content."\n";
        }
    }
}

exit 0;
29 changes: 22 additions & 7 deletions src/MGRAST/bin/parse_obo.py
Expand Up @@ -10,7 +10,6 @@
# declare a blank dictionary, keys are the term_ids
terms = {}
quote = re.compile(r'\"(.+?)\"')
term = re.compile(r'^[A-Z]+:\d+$')
rank = re.compile(r'^has_rank NCBITaxon:(.+)$')
# to check for circular recursion
ascSeen = set()
Expand Down Expand Up @@ -115,7 +114,7 @@ def getParents(tid, full=False):
def getTop(full=False):
top = {} if full else []
for t, info in terms.iteritems():
if (len(info['parentNodes']) == 0) and (len(info['childNodes']) > 0) and term.match(t):
if (len(info['parentNodes']) == 0) and (len(info['childNodes']) > 0):
if full:
top[t] = terms[t]
else:
Expand All @@ -136,7 +135,7 @@ def outputTab(data, ofile):
print out_str

def main(args):
global terms, addRank
global terms
parser = OptionParser(usage="usage: %prog [options] -i <input file> -o <output file>")
parser.add_option("-i", "--input", dest="input", default=None, help="input .obo file")
parser.add_option("-o", "--output", dest="output", default=None, help="output: .json file or stdout, default is stdout")
Expand All @@ -150,16 +149,16 @@ def main(args):
parser.add_option("", "--rank", dest="rank", action="store_true", default=False, help="return output with 'rank' field, only for --full")
parser.add_option("", "--common", dest="common", action="store_true", default=False, help="use only common name synonyms (--full / NCBI taxonomy)")
parser.add_option("", "--no_id", dest="no_id", action="store_true", default=False, help="remove 'id' from struct to reduce size, only for --full")
parser.add_option("", "--strip_prefix", dest="strip_prefix", action="store_true", default=False, help="remove prefix from 'id' to reduce size")
parser.add_option("", "--no_parents", dest="no_parents", action="store_true", default=False, help="remove 'parentNodes' from struct to reduce size, only for --full")
parser.add_option("", "--no_description", dest="no_description", action="store_true", default=False, help="remove 'description' from struct to reduce size, only for --full")
(opts, args) = parser.parse_args()
if not (opts.input and os.path.isfile(opts.input)):
parser.error("missing input")
if not opts.relations:
parser.error("missing relations")
if (not opts.term_id) and (opts.get != 'top'):
opts.get = 'all'
if opts.rank:
addRank = True

oboFile = open(opts.input, 'r')
relations = opts.relations.split(',')
Expand All @@ -174,6 +173,8 @@ def main(args):
term = parseTagValue(getTerm(oboFile), opts.common)
if (len(term) != 0) and ('name' in term) and (len(term['name']) > 0):
termID = term['id'][0]
if opts.strip_prefix:
termID = termID.split(":")[1]
termName = term['name'][0]
if 'def' in term:
termDesc = term['def'][0]
Expand All @@ -187,7 +188,11 @@ def main(args):
termParents = []
for rel in relations:
if rel in term:
termParents.extend([p.split()[0] for p in term[rel]])
for p in term[rel]:
if opts.strip_prefix:
termParents.append(p.split()[0].split(":")[1])
else:
termParents.append(p.split()[0])

# each ID will have two arrays of parents and children
if termID not in terms:
Expand All @@ -201,6 +206,8 @@ def main(args):
rval = rank.match(term['property_value'][0])
if rval:
terms[termID]['rank'] = rval.group(1)
if terms[termID]['rank'] == 'superkingdom':
terms[termID]['rank'] = 'domain'

# append parents of the current term
terms[termID]['parentNodes'] = termParents
Expand Down Expand Up @@ -241,6 +248,9 @@ def main(args):
if opts.no_parents:
for v in data.itervalues():
del v['parentNodes']
if opts.no_description:
for v in data.itervalues():
del v['description']

# have global info
if opts.full and opts.metadata:
Expand All @@ -249,7 +259,12 @@ def main(args):
mdata['nodes'] = data
outputJson(mdata, opts.output)
except:
outputJson(data, opts.output)
# default action
mdata = {
'nodes': data,
'rootNode': opts.term_id
}
outputJson(mdata, opts.output)
# tabbed list output
elif opts.tab and (not opts.full):
outputTab(data, opts.output)
Expand Down

0 comments on commit 2830180

Please sign in to comment.