
Merge pull request #127 from McTavishLab/relabel
Relabel
snacktavish committed Jul 8, 2020
2 parents 2dddf65 + d1664ad commit 5a3118a
Showing 7 changed files with 61 additions and 49 deletions.
7 changes: 7 additions & 0 deletions bin/physcraper_run.py
@@ -26,6 +26,8 @@
parser.add_argument("-st","--search_taxon", help="taxonomic id to constrain blast search. format ott:123 or ncbi:123. Deafult will use ingroup of tree on OpenTree, or MRCA of input tips ")

parser.add_argument("-spn","--species_number", help="max number of seqs to include per species")
parser.add_argument("-bl","--block_list", nargs='+', help="ncbi accession numbers to exclude")

parser.add_argument("-tp","--trim_perc", help="minimum percentage of seqs end of alignemnts")
parser.add_argument("-rlmax","--relative_length_max", help="max relative length of added seqs, compared to input seq len")
parser.add_argument("-rlmin","--relative_length_min", help="min relative length of added seqs, compared to input seq len")
@@ -255,6 +257,11 @@
else:
boot_reps = 100

if args.block_list:
for acc in args.block_list:
scraper.blocklist.append(acc)
sys.stdout.write("Excluding accession numbers {}\n".format(','.join(scraper.blocklist)))
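
For illustration, the new flag takes one or more accession numbers on the command line. A hedged sketch of an invocation (the study, tree, and output flags are placeholders for whatever arguments the run already uses; only -bl comes from this change, and JX895264.1 is the accession excluded in the new test below):

    physcraper_run.py -s <study_id> -t <tree_id> -o <output_dir> -bl JX895264.1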


run = 1
if args.repeat:
6 changes: 5 additions & 1 deletion docs/mds/PhyscraperRun.md
@@ -123,9 +123,13 @@ You can use your own blast database, for example set up on an AWS server.

-tp TRIM_PERC, --trim_perc TRIM_PERC
minimum percentage of seqs required at ends of alignment
-rl RELATIVE_LENGTH, --relative_length RELATIVE_LENGTH
-rlmax RELATIVE_LENGTH_MAX, --relative_length_max RELATIVE_LENGTH_MAX
max relative length of added seqs, compared to input
aln len (blast matches not within length cutoffs are stored in outputs/seqlen_mismatch.txt)
-rlmin RELATIVE_LENGTH_MIN, --relative_length_min RELATIVE_LENGTH_MIN
min relative length of added seqs, compared to input
aln len
(blast matches not within length cutoffs are stored in outputs/seqlen_mismatch.txt)
-spn SPECIES_NUMBER, --species_number SPECIES_NUMBER
max number of seqs to include per species
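
As a minimal sketch of the filter that -rlmax and -rlmin control (assuming the 80%/120% defaults mentioned in the tutorial; the variable names are illustrative, not the library's):

    # illustrative only: how relative-length cutoffs become absolute length cutoffs
    avg_len = 500                    # hypothetical mean length of the input alignment seqs
    rl_min, rl_max = 0.8, 1.2        # --relative_length_min / --relative_length_max
    seq_len_min = avg_len * rl_min   # 400.0: shorter blast matches are excluded
    seq_len_max = avg_len * rl_max   # 600.0: longer blast matches are excluded

    def within_cutoffs(seq):
        # matches failing this check are logged to outputs/seqlen_mismatch.txt
        return seq_len_min < len(seq) < seq_len_max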

3 changes: 2 additions & 1 deletion docs/mds/Tutorial.md
@@ -90,13 +90,14 @@ The structure consists of:
-- blast results for each tip in both the tree and the alignment

- run
-- This is where intermediate processing files, and the json formatted otu information are stored
-- This is where intermediate processing files and the json-formatted otu information are stored. Many of these files are re-used if the run crashes and is restarted. Make sure you use a new output directory, or otherwise empty this folder, if you want to change run parameters.

- outputs
-- final tree and alignment

-- CSV file with information about each sequence

-- By default, sequences shorter than 80% or longer than 120% of the average input sequence length (these values are configurable) are not added to the alignment, as they can cause alignment errors. The accession numbers, taxa, and sequence lengths are written out to seqlen_mismatch.txt.
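
-- Each line of seqlen_mismatch.txt follows the "taxon: ..., ncbi: ..., acc: ..., len: ..." pattern written in physcraper/scrape.py (visible later in this diff), so it can be summarised with a few lines of Python. A sketch, assuming no field value contains a comma:

    # illustrative parser for outputs/seqlen_mismatch.txt
    with open("outputs/seqlen_mismatch.txt") as fi:
        for line in fi:
            fields = dict(part.split(": ", 1) for part in line.strip().split(", "))
            print(fields["acc"], fields["len"])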


### Compare your new tree to existing relationships
11 changes: 8 additions & 3 deletions physcraper/aligntreetax.py
@@ -121,13 +121,18 @@ def write_otu_file(treetax, filepath, schema="table"):
if schema == "json":
with open(filepath, "w") as outfile:
json.dump(treetax.otu_dict, outfile)
leaves = [tax.label for tax, seq in treetax.aln.items()]
if schema == "table":
keys = ['^ot:ottTaxonName','^ot:ottId','^ncbi:taxon','^ncbi:accession','^physcraper:last_blasted','^physcraper:ingroup','^physcraper:status','^ot:originalLabel','^ncbi:title']
header = ["otu_id"] + keys
header = ["otu_id"] + keys + ['^physcraper:in_current_aln']
with open(filepath, "w") as outfile:
outfile.write("\t".join(header)+"\n")
for otu in treetax.otu_dict:
vals = [str(treetax.otu_dict[otu].get(key, "-")) for key in keys]
if otu in leaves:
vals.append('True')
else:
vals.append('False')
outfile.write("\t".join([to_string(otu)]+vals)+"\n")
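
A hedged usage sketch for the new '^physcraper:in_current_aln' column: since the table is tab-separated with these exact headers, the otus present in the current alignment can be pulled out as follows (the file path is hypothetical):

    import csv

    # illustrative only: list otu_ids flagged as present in the current alignment
    with open("outputs/otu_info.csv") as fi:
        reader = csv.DictReader(fi, delimiter="\t")
        in_aln = [row["otu_id"] for row in reader if row["^physcraper:in_current_aln"] == "True"]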


@@ -149,7 +154,7 @@ def write_labelled_tree(treetax, label, filepath, schema = "newick", norepeats=T
"""
#debug("write labelled files")
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon", '^ncbi:accession']
tmp_newick = treetax.tre.as_string(schema="newick")
tmp_tre = Tree.get(data=tmp_newick,
schema="newick",
@@ -193,7 +198,7 @@ def write_labelled_aln(aligntreetax, label, filepath, schema = "fasta", norepeat
"""
#debug("write labelled files")
assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
"^ot:originalLabel", "^ot:ottId", "^ncbi:taxon", '^ncbi:accession']
tmp_fasta = aligntreetax.aln.as_string(schema="fasta")
tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta,
schema="fasta")
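
With '^ncbi:accession' now an accepted label, a usage sketch (the AlignTreeTax object and output path are placeholders):

    # relabel alignment tips by GenBank accession instead of taxon name
    write_labelled_aln(aligntreetax, '^ncbi:accession', "outputs/aln_by_accession.fas", schema="fasta")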
5 changes: 2 additions & 3 deletions physcraper/ids.py
@@ -122,7 +122,6 @@ def get_ncbiid_from_acc(self, acc):


#removed function find_tax_id because it wasn't being used

def get_tax_seq_acc(self, acc):
if not os.path.exists(self.full_seq_path):
os.makedirs(self.full_seq_path)
@@ -142,7 +141,7 @@ def get_tax_seq_acc(self, acc):
#try:
assert(header.split()[1].startswith('taxname:'))
tax_name = header.split()[1].strip('taxname:')
ncbi_id = header.split()[2].strip('ncbi_id:')
ncbi_id = header.split()[2].strip('ncbi:')
seq = "".join(fi.readlines())
# except IndexError:
# print("IndexError")
@@ -204,7 +203,7 @@ def entrez_efetch(self, gb_id):
tries = 10
Entrez.email = self.config.email
if self.config.api_key:
Entrez.api_key = self.config.apikey
Entrez.api_key = self.config.api_key
handle = None

# method needs delay because of ncbi settings
49 changes: 13 additions & 36 deletions physcraper/scrape.py
@@ -77,7 +77,7 @@ class PhyscraperScrape(object):
* **self.date**: Date of the run - may lag behind real date!
* **self.repeat**: either 1 or 0, it is used to determine if we continue updating the tree, no new seqs found = 0
* **self.newseqs_acc**: list of all gi_ids that were passed into remove_identical_seq(). Used to speed up adding process
* **self.blacklist**: list of gi_id of sequences that shall not be added or need to be removed. Supplied by user.
* **self.blocklist**: list of gi_id of sequences that shall not be added or need to be removed. Supplied by user.
* **self.seq_filter**: list of words that may occur in otu_dict.status and which shall not be used in the building of FilterBlast.sp_d (that's the main function), but it is also used as assert statement to make sure unwanted seqs are not added.
* **self.unpublished**: True/False. Used to look for local unpublished seq that shall be added if True.
* **self.path_to_local_seq:** Usually False, contains path to unpublished sequences if option is used.
@@ -150,7 +150,7 @@ def __init__(self, data_obj, ids_obj=None, search_taxon=None):
self.threshold = self.config.spp_threshold
#markers for status
#holders for new data
self.blacklist = []
self.blocklist = []

def map_taxa_to_ncbi(self):
for otu in self.data.otu_dict:
@@ -719,13 +719,17 @@ def remove_identical_seqs(self):
# debug("gb_id is {}".format(gb_id) )
assert seq
if seq_len_min < len(seq) < seq_len_max:
if self.blacklist is not None and gb_id in self.blacklist:
debug("gb_id {} in blacklist, not added".format(gb_id))
if self.blocklist is not None and gb_id in self.blocklist:
debug("gb_id {} in blocklist, not added".format(gb_id))
pass
else:
otu_id = self.data.add_otu(gb_id, self.ids)
tmp_dict = self.seq_dict_build(seq, otu_id, tmp_dict)
else:
lr = open("{}/seqlen_mismatch.txt".format(self.outputsdir),"a")
taxid,taxname, seq = self.ids.get_tax_seq_acc(gb_id)
lr.write("taxon: {}, ncbi: {}, acc: {}, len: {}\n".format(taxname, taxid, gb_id, len(seq)))
lr.close()
debug("\nlen {}:{} was not between {} and {}\n".format(gb_id, len(seq), seq_len_min, seq_len_max))
otu_in_aln = set([taxon.label for taxon in self.data.aln])
for otu in otu_in_aln:
@@ -781,6 +785,8 @@ def filter_seqs(self, tmp_dict, type="random", threshold=None):
for otu in tmp_dict:
if otu in selected_otus:
filtered_dict[otu] = tmp_dict[otu]
else:
self.data.otu_dict[otu]['^physcraper:status'] = "removed in sampling down to {} per spp.".format(self.threshold)
return filtered_dict


@@ -851,13 +857,9 @@ def select_seq_by_length(self, otu_list, seq_dict, count):
lens.append(len(seq_dict[otu]))
lens.sort(reverse=True)
cutoff = lens[count]
# debug("cutoff is {}".format(cutoff))
# debug("lengths is {}".format(lens))
selected_otus = []
for otu in otu_len_dict:
# debug("otu {} has len {}".format(otu, otu_len_dict[otu]))
if otu_len_dict[otu] >= cutoff:
# debug("selected!")
selected_otus.append(otu)
if len(selected_otus) == count:
return selected_otus
@@ -885,31 +887,6 @@ def select_seq_at_random(self, otu_list, count):
return selected_otus



# def pickle_dump(self, filename=None, recursion=100000):
# """writes out class to pickle file.
# We need to increase the recursion depth here, as it currently fails with the standard run.
#
# :param filename: optional filename
# :param recursion: pickle often failed with recursion depth, that's why it's increased here
# :return: writes out file
# """
# current = sys.getrecursionlimit()
# sys.setrecursionlimit(recursion)#
#
# if filename:
# ofi = open(filename, "wb")
# else:
# ofi = open("{}/scrape_checkpoint.p".format(self.workdir), "wb")
# pickle.dump(self, ofi, pickle.HIGHEST_PROTOCOL)
# sys.setrecursionlimit(current)

def dump(self):
self.data.write_otus('otu_dict.json')
with open("{}/{}".format(self.rundir, 'config.json'), "w") as outfile:
json.dump(self.config.__dict__, outfile)


def write_new_seqs(self, filename='date'):
"""writes out the query sequence file"""
debug("write query seq")
@@ -1230,16 +1207,16 @@ def calculate_final_tree(self, boot_reps = 100):



def remove_blacklistitem(self):
"""This removes items from aln, and tree, if the corresponding Genbank identifer were added to the blacklist.
def remove_blocklistitem(self):
"""This removes items from aln, and tree, if the corresponding Genbank identifer were added to the blocklist.
Note, that seq that were not added because they were similar to the one being removed here, are lost
(that should not be a major issue though, as in a new blast_run, new seqs from the taxon can be added.)
"""
for tax in self.data.aln.taxon_namespace:
gi_id = self.data.otu_dict[tax.label].get("^ncbi:gi")
acc = self.data.otu_dict[tax.label].get("^ncbi:accession")
if gi_id in self.blacklist or acc in self.blacklist:
if gi_id in self.blocklist or acc in self.blocklist:
self.data.remove_taxa_aln_tre(tax.label)
self.data.otu_dict[tax.label]['^physcraper:status'] = "deleted, Genbank identifier is part of blocklist"
# this should not need to happen here: prune_short; instead...
29 changes: 24 additions & 5 deletions tests/test_multi.py
@@ -117,6 +117,26 @@ def test_remove_identical_seqs():
assert len(scraper.new_seqs_otu_id) == 14
#Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.


def test_blocklist():
ids = copy.deepcopy(ids_base)
data_obj = copy.deepcopy(data_obj_base)
# print("start")
scraper = PhyscraperScrape(data_obj, ids)
scraper.blocklist.append('JX895264.1')
scraper._blasted = 1
blast_dir = "tests/data/precooked/fixed/tte_blast_files"
#scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
scraper.read_blast_wrapper(blast_dir=blast_dir)
#print scraper.ncbi_mrca
assert(len(scraper.new_seqs) == 0)
assert(len(scraper.data.aln) == 5)
assert len(scraper.new_seqs_otu_id) == 13
aln_path1 = scraper.data.write_aln()
scraper.align_new_seqs()
assert len(scraper.data.aln) == 18
os.remove(aln_path1)

#TODO find an example where we do get identical sequences and need to discard them


@@ -142,9 +162,7 @@ def test_remove_identical_seqs():
# status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
# assert(status in ('original', 'query'))

aln_path1 = scraper.data.write_aln()
scraper.align_new_seqs()
assert len(scraper.data.aln) == 19




@@ -277,7 +295,7 @@ def test_run_align():
# scraper.generate_streamed_alignment()
#assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))

test_run_align()


@mark.xfail
def test_mpi():
@@ -390,4 +408,5 @@ def test_write_labelled():
assert os.path.isfile('tests/data/tmp/otu_info_test.json')
assert os.path.isfile('tests/data/tmp/otu_info_test.csv')
os.remove('tests/data/tmp/otu_info_test.json')
os.remove('tests/data/tmp/otu_info_test.csv')
os.remove('tests/data/tmp/otu_info_test.csv')
