Merge pull request #87 from McTavishLab/dev

Dev
McTavishLab · May 22, 2020 · 21007d4 · 21007d4
2 parents 0df4ad7 + 442d870
commit 21007d4
Show file tree

Hide file tree

Showing 19 changed files with 134 additions and 310 deletions.
diff --git a/bin/physcraper_run.py b/bin/physcraper_run.py
@@ -92,7 +92,7 @@
     conf.email = args.email
 
 sys.stdout.write("Configuration Settings\n")
-sys.stdout.write(conf.config_str())
+sys.stdout.write(conf.config_str()+"\n")
 
 study_id =  None
 if args.tree_link:
@@ -154,6 +154,7 @@
                                         configfile = conf)
     sys.stdout.write("{} taxa in alignment and tree\n".format(len(scraper.data.aln)))
     scraper.data.write_files()
+    scraper.data.write_files(treepath = "before_otu_{}.tre".format(scraper.data.tag), alnpath = "before_otu_{}.aln".format(scraper.data.tag))
     scraper.data.write_labelled(filename="before_labelled_{}".format(scraper.data.tag), label='^ot:ottTaxonName')
     scraper.data.write_otus(schema="json")
 

diff --git a/docs/ConfigObj.md b/docs/ConfigObj.md
@@ -17,7 +17,7 @@ Pulls out the configuration information from
   * **self.e_value_thresh**: the defined threshold for the e-value during Blast searches, check out:
                             https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
   * **self.hitlist_size**: the maximum number of sequences retrieved by a single blast search
-  * **self.seq_len_perc**: value from 0 to 1. Defines how much shorter new seq can be compared to input
+  * **self.minlen**: value from 0 to 1. Defines how much shorter new seq can be compared to input
   * **self.get_ncbi_taxonomy**: Path to sh file doing something...
   * **self.ncbi_dmp**: path to file that has gi numbers and the corresponding ncbi tax id's
   * **self.phylesystem_loc**: defines which phylesystem for OpenTree datastore is used. 

diff --git a/docs/examples/example.config b/docs/examples/example.config
@@ -42,13 +42,10 @@ unmapped = keep
 spp_threshold = 5
 
 #This is how much shorter new sequences are allowed to be compared to your original sequence lengths when added to aln. Is used in during the process of adding new seqs as well as removing seq that are too short
-seq_len_perc = 0.8
-
-# value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
-trim_perc = 0.75
+min_length = 0.8
 
 # max length for values to add to aln
-max_len = 2.5
+max_length = 1.5
 
 #You should not need to change any of these!
 taxonomy_path = taxonomy
diff --git a/physcraper/aligntreetax.py b/physcraper/aligntreetax.py
@@ -353,7 +353,7 @@ def prune_short(self):
                             self.aln]
         avg_seqlen = sum(self.orig_seqlen) / len(self.orig_seqlen)
         #debug("average sequence length is {}".format(avg_seqlen))
-        seq_len_cutoff = avg_seqlen * self.config.seq_len_perc
+        seq_len_cutoff = avg_seqlen * self.config.minlen
         prune = []
         aln_ids = set()
         for tax, seq in self.aln.items():
@@ -376,56 +376,6 @@ def prune_short(self):
         self.check_tre_in_aln()
         self._reconciled = 1
 
-    def trim(self):
-        """ It removes bases at the start and end of alignments, if they are represented by less than the value
-        specified in the config file. E.g. 0.75 given in config means, that 75% of the sequences need to have a
-        base present
-
-        Ensures, that not whole chromosomes get dragged in. It's cutting the ends of long sequences.
-
-        has test: test_trim.py
-        """
-        # debug('in trim')
-        taxon_missingness = self.config.trim_perc
-        i = 0
-        seqlen = len(self.aln[i])
-        while seqlen == 0:
-            i = i + 1
-            seqlen = len(self.aln[i])
-        for tax in self.aln:
-            if len(self.aln[tax]) != seqlen:
-                sys.stderr.write("can't trim un-aligned inputs, moving on")
-                return
-        start = 0
-        stop = seqlen
-        cutoff = len(self.aln) * taxon_missingness
-        for i in range(seqlen):
-            counts = {"?": 0, "-": 0}
-            for tax in self.aln:
-                call = self.aln[tax][i].label
-                if call in ["?", "-"]:
-                    counts[call] += 1
-            if counts['?'] + counts['-'] <= cutoff:  # first ok column
-                start = i
-                break
-        for i in range(seqlen, 0, -1):  # previously seqlen-1, that cuts off last character of aln, I changed it.
-            counts = {'?': 0, '-': 0}
-            for tax in self.aln:
-                call = self.aln[tax][i - 1].label  # changing seqlen-1 to seqlen requires that we have here i-1
-                if call in ['?', '-']:
-                    counts[call] += 1
-            if counts['?'] + counts['-'] <= cutoff:
-                stop = i
-                break
-        # here alignment gets shortened to start:stop
-        for taxon in self.aln:
-            self.aln[taxon] = self.aln[taxon][start:stop]
-        # make sure that tre is presented in aln
-        self.check_tre_in_aln()
-        if _VERBOSE:
-            sys.stdout.write("trimmed alignment ends to < {} missing taxa, "
-                             "start {}, stop {}\n".format(taxon_missingness, start, stop))
-        return
 
 
     def check_tre_in_aln(self):
@@ -474,6 +424,54 @@ def remove_taxa_aln_tre(self, taxon_label):
         else:
             self.otu_dict[taxon_label]['^physcraper:status'] = "deleted, updated otu_dict but was never in tre or aln!"
 
+    def trim(self, min_taxon_perc):
+        """ It removes bases at the start and end of alignments, if they are represented by less than the value
+        specified. E.g. 0.75  that 75% of the sequences need to have a base present
+
+        Ensures, that not whole chromosomes get dragged in. It's cutting the ends of long sequences.
+
+        has test: test_trim.py
+        """
+        # debug('in trim')
+        i = 0
+        seqlen = len(self.aln[i])
+        while seqlen == 0:
+            i = i + 1
+            seqlen = len(self.aln[i])
+        for tax in self.aln:
+            if len(self.aln[tax]) != seqlen:
+                sys.stderr.write("can't trim un-aligned inputs, moving on")
+                return
+        start = 0
+        stop = seqlen
+        cutoff = len(self.aln) * min_taxon_perc
+        for i in range(seqlen):
+            counts = {"?": 0, "-": 0}
+            for tax in self.aln:
+                call = self.aln[tax][i].label
+                if call in ["?", "-"]:
+                    counts[call] += 1
+            if counts['?'] + counts['-'] <= cutoff:  # first ok column
+                start = i
+                break
+        for i in range(seqlen, 0, -1):  # previously seqlen-1, that cuts off last character of aln, I changed it.
+            counts = {'?': 0, '-': 0}
+            for tax in self.aln:
+                call = self.aln[tax][i - 1].label  # changing seqlen-1 to seqlen requires that we have here i-1
+                if call in ['?', '-']:
+                    counts[call] += 1
+            if counts['?'] + counts['-'] <= cutoff:
+                stop = i
+                break
+        # here alignment gets shortened to start:stop
+        for taxon in self.aln:
+            self.aln[taxon] = self.aln[taxon][start:stop]
+        # make sure that tre is presented in aln
+        self.check_tre_in_aln()
+        if _VERBOSE:
+            sys.stdout.write("trimmed alignment ends to < {} missing taxa, "
+                             "start {}, stop {}\n".format(min_taxon_perc, start, stop))
+        return
 
     def get_otu_for_acc(self, gb_id):
         if gb_id in set([self.otu_dict[otu].get("^ncbi:accession",'UNK') for otu in self.otu_dict]):

diff --git a/physcraper/configobj.py b/physcraper/configobj.py
@@ -41,7 +41,7 @@ class ConfigObj(object):
 
       * **self.e_value_thresh**: the defined threshold for the e-value during Blast searches, check out: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
       * **self.hitlist_size**: the maximum number of sequences retrieved by a single blast search
-      * **self.seq_len_perc**: value from 0 to 1. Defines how much shorter new seq can be compared to input
+      * **self.minlen**: value from 0 to 1. Defines how much shorter new seq can be compared to input
       * **self.trim_perc**: value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
       * **self.maxlen**: max length for values to add to aln
       * **self.get_ncbi_taxonomy**: Path to sh file doing something...
@@ -86,8 +86,7 @@ def set_defaults(self):
         self.num_threads = 4
         self.delay = 90
         self.spp_threshold = 5
-        self.seq_len_perc = 0.8
-        self.trim_perc = 0.8
+        self.minlen = 0.8
         self.maxlen = 1.2
         self.url_base = None
         self.taxonomy_dir = "{}/taxonomy".format(physcraper_dir)
@@ -104,9 +103,8 @@ def config_str(self):
 delay = {delay}
 [physcraper]
 spp_threshold = {sppt}
-seq_len_perc = {perc}
-trim_perc = {t_perc}
-max_len = {max_len}
+min_length = {perc}
+max_length = {maxlen}
 taxonomy_path = {taxonomy}'''.format(
                                     email=self.email,
                                     e_val=self.e_value_thresh,
@@ -117,9 +115,8 @@ def config_str(self):
                                     nt=self.num_threads,
                                     delay=self.delay,
                                     sppt=self.spp_threshold,
-                                    perc=self.seq_len_perc,
-                                    t_perc=self.trim_perc,
-                                    max_len=self.maxlen,
+                                    perc=self.minlen,
+                                    maxlen=self.maxlen,
                                     taxonomy = self.taxonomy_dir)
         return(config_text)
     def write_file(self, workdir, filename = "run.config"):
@@ -184,16 +181,12 @@ def read_config(self, configfi, interactive):
         )       
         # #############
         # read in physcraper settings       
-        self.seq_len_perc = float(config["physcraper"]["seq_len_perc"])
-        assert 0 < self.seq_len_perc <= 1, (
-                "value `%s` is not between 0 and 1" % self.seq_len_perc
+        self.minlen = float(config["physcraper"]["min_length"])
+        assert 0 < self.minlen <= 1, (
+                "value `%s` is not between 0 and 1" % self.minlen
         )
         self.spp_threshold = int(config["physcraper"]["spp_threshold"])
-        self.trim_perc = float(config["physcraper"]["trim_perc"])
-        assert 0 < self.trim_perc < 1, (
-                "value `%s` is not between 0 and 1" % self.trim_perc
-        )
-        self.maxlen = float(config["physcraper"]["max_len"])
+        self.maxlen = float(config["physcraper"]["max_length"])
         assert 1 < self.maxlen, (
                 "value `%s` is not larger than 1" % self.maxlen
         )

diff --git a/physcraper/ids.py b/physcraper/ids.py
@@ -130,17 +130,24 @@ def get_tax_seq_acc(self, acc):
         if len(gb_id.split(".")) == 1:
             debug("accession number {} not recognized".format(gb_id))
             return None, None, None
-        seq_path = "{}/{}.json".format(self.full_seq_path, gb_id)
+        seq_path = "{}/{}_remote.fasta".format(self.full_seq_path, gb_id)
+        ncbi_id = tax_name = seq = None
         if gb_id in self.acc_tax_seq_dict:
             tax_name = self.acc_tax_seq_dict[gb_id]["taxname"]
-            ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
+            ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:"]
             seq = self.acc_tax_seq_dict[gb_id]["seq"]
         elif os.path.exists(seq_path):
-            self.acc_tax_seq_dict[gb_id] = json.load(open(seq_path))
-            tax_name = self.acc_tax_seq_dict[gb_id]["taxname"]
-            ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
-            seq = self.acc_tax_seq_dict[gb_id]["seq"]
-        else:
+            fi = open(seq_path)
+            header = fi.readline().strip('>')
+            #try:
+            assert(header.split()[1].startswith('taxname:'))
+            tax_name = header.split()[1].strip('taxname:')
+            ncbi_id = header.split()[2].strip('ncbi_id:')
+            seq = "".join(fi.readlines())
+#            except IndexError:
+ #               print("IndexError")
+  #              pass
+        if seq == None:
             read_handle = self.entrez_efetch(gb_id)
             tax_name = ncbi_data_parser.get_ncbi_tax_name(read_handle)
             ncbi_id =  ncbi_data_parser.get_ncbi_tax_id(read_handle)
@@ -149,7 +156,9 @@ def get_tax_seq_acc(self, acc):
             self.ncbiid_to_spn[ncbi_id] = tax_name
             self.acc_ncbi_dict[gb_id] = ncbi_id
             self.acc_tax_seq_dict[gb_id] = {'taxname':tax_name, "^ncbi:taxon":ncbi_id, 'seq':seq} #This is going to be a memory hog...
-            json.dump(self.acc_tax_seq_dict[gb_id], open(seq_path, 'w'))
+            with open(seq_path, 'w') as fi:
+                fi.write("> {} taxname:{} ncbi:{}\n".format(gb_id, tax_name, ncbi_id))
+                fi.write(self.acc_tax_seq_dict[gb_id]['seq'])
         assert ncbi_id is not None
         return ncbi_id, tax_name, seq
 

diff --git a/physcraper/opentree_helpers.py b/physcraper/opentree_helpers.py
@@ -323,7 +323,7 @@ def OtuJsonDict(id_to_spn, id_dict):
             else:
                 ncbi_spn = id_dict.ott_to_ncbi[ottid]
             sp_info_dict[otu_id] = {
-                "^ncbi:taxon": ncbiid,
+                "^ncbi:taxon": int(ncbiid),
                 "^ot:ottTaxonName": ottname,
                 "^ot:ottId": ottid,
                 "^ot:originalLabel": tipname,

diff --git a/physcraper/scrape.py b/physcraper/scrape.py
@@ -178,8 +178,6 @@ def run_local_blast_cmd(self, query, taxon_label, fn_path):
         outfmt = "6 sseqid staxids sscinames pident evalue bitscore sseq salltitles sallseqid"
         # outfmt = " -outfmt 5"  # format for xml file type
         # TODO query via stdin
-        #blastn -query /home/ejmctavish/projects/otapi/data_physcraper/output_pg55_local/current_blast_run/tmp.fas -db /home/ejmctavish/ncbi/localblastdb//nt -out /home/ejmctavish/projects/otapi/data_physcraper/output_pg55_local/current_blast_run/otu376447.txt  -outfmt '6 sseqid staxids sscinames pident evalue bitscore sseq salltitles sallseqid' -num_threads 2 -max_target_seqs 10 -max_hsps 10
-
         blastcmd = ["blastn",
                     "-query",
                     "{}/tmp.fas".format(abs_blastdir),
@@ -616,18 +614,18 @@ def seq_dict_build(self, seq, new_otu_label, seq_dict):
                 #debug("seq {} is shorter than {}".format(new_otu_label, otu_lab))
                 if new_seq in inc_seq:# if seq is identical and shorter
                     #debug("seq is identical and shorter")
-                    if existing_tax_id != tax_new_seq:  # different taxa
+                    if int(existing_tax_id) != int(tax_new_seq):  # different taxa
                         if _VERBOSE or _DEBUG:
-                            sys.stdout.write("\nseq {} is subsequence of {}, "
+                            sys.stdout.write("\nseq {} is identical and/or subsequence of {}, "
                                              "but different species name \n".format(new_otu_label, otu_lab))
-                        reason = "new seq added; subsequence of {}, but different taxon".format(otu_lab)
+                        reason = "new seq added; identical and/or subsequence of {}, but different taxon".format(otu_lab)
                         #debug("otu {}, tmp status {}".format(new_otu_label, reason))
                         #still should be added, but need to check other samples
                     else:  # subseq of same otu
                         reason = "seq {} is subsequence or identical to {}, not added ".format(new_otu_label, otu_lab)
                         if _VERBOSE:
                             sys.stdout.write("\n{}\n".format(reason))
-                        self.data.otu_dict[new_otu_label]['^physcraper:status'] = "subsequence, not added"
+                        self.data.otu_dict[new_otu_label]['^physcraper:status'] = "identical and/or subsequence of {}, not added".format(otu_lab)
                         #debug("{} not added, subseq of {}".format(new_otu_label, otu_lab))
                         #debug("{} was NOT added to seq_dict: {}".format(new_otu_label, reason))
                         return seq_dict
@@ -652,7 +650,7 @@ def seq_dict_build(self, seq, new_otu_label, seq_dict):
                         del seq_dict[otu_lab]
                         seq_dict[new_otu_label] = seq
                         self.data.remove_taxa_aln_tre(otu_lab)
-                        reason = "\nseq {} is supersequence of {}, {} added and {} removed ".format(new_otu_label, otu_lab, new_otu_label, otu_lab)
+                        reason = "seq {} is supersequence of {}, {} added and {} removed ".format(new_otu_label, otu_lab, new_otu_label, otu_lab)
                         if _VERBOSE or _DEBUG:
                             sys.stdout.write("\n{}\n".format(reason))
                         self.data.otu_dict[otu_lab]['^physcraper:status'] = "deleted, {} is supersequence ".format(new_otu_label)
@@ -686,7 +684,7 @@ def remove_identical_seqs(self):
         self.data.orig_seqlen = [len(self.data.aln[tax].symbols_as_string().replace("-", "").replace("N", "")) for tax in
                                  self.data.aln]
         avg_seqlen = sum(self.data.orig_seqlen) / len(self.data.orig_seqlen)  # HMMMMMMMM
-        seq_len_min = avg_seqlen * self.config.seq_len_perc
+        seq_len_min = avg_seqlen * self.config.minlen
         seq_len_max = avg_seqlen * self.config.maxlen
         #debug("we already have {}".format(all_added_gi))
         for gb_id, seq in self.new_seqs.items():
@@ -706,6 +704,7 @@ def remove_identical_seqs(self):
         for otu in otu_in_aln:
             del tmp_dict[otu]
         filter_dict = self.filter_seqs(tmp_dict, type='random', threshold = self.config.spp_threshold)
+        ##EJM TODO add to notes in OTU_DICT
         self.new_seqs_otu_id = filter_dict  # renamed new seq to their otu_ids from GI's, but all info is in self.otu_dict
         self.new_seqs = {} #Wipe clean
 #        debug("len new seqs otu dict after remove identical{}".format(len(self.new_seqs_otu_id)))
@@ -912,7 +911,6 @@ def align_new_seqs(self, aligner = 'muscle'):
             self.run_papara()
         if aligner == 'muscle':
             self.run_muscle()
-        self.data.trim()
         alnfi = self.data.write_aln()
         self.data.write_otus(schema="table")
         self.data.write_otus(schema="json")
@@ -1207,7 +1205,6 @@ def calculate_final_tree(self):
         debug("calculate final tree")
         self.data.write_files(treepath="physcraper_final_notrim.tre", alnpath="physcraper_final_notrim.fas")
         self.data.prune_short()
-        self.data.trim()
         self.data.write_files(treepath="physcraper_final_trim.tre", alnpath="physcraper_final_trim.fas")
         if os.path.exists("[]/previous_run".format(self.workdir)):
             self.est_full_tree(path="previous_run")

diff --git a/tests/data/aws.config b/tests/data/aws.config
@@ -43,33 +43,8 @@ delay = 90
 unmapped = keep
 
 #This is how much shorter new sequences are allowed to be compared to your original sequence lengths when added to aln. Is used in during the process of adding new seqs as well as removing seq that are too short
-seq_len_perc = 0.8
-
-# value that determines how many seq need to be present before the beginning and end of alignment will be trimmed
-trim_perc = 0.75
+min_length = 0.8
 
 # max length for values to add to aln
-max_len = 2.5
-
-
-#######
-## INTERNAL PHYSCRAPER SETTINGS
-#---------------------------------------------------------------------------------
-#Things below here you should not need to change!
-
-#Only required if blast location is local
-[ncbi_parser]
-nodes_fn = ./taxonomy/nodes.dmp
-names_fn = ./taxonomy/names.dmp
-
-[phylesystem]
-location = api
-#local or api, leave set to api unless you have installed phylesystem locally
-
-[taxonomy]
-#You should not need to change any of these!
-ott_ncbi = taxonomy/ott_ncbi
-get_ncbi_taxonomy = taxonomy/get_ncbi_taxonomy.sh
-ncbi_dmp = taxonomy/gi_taxid_nucl.dmp
-id_pickle = taxonomy/id_dmp.p
+max_length = 2.5