Skip to content

Commit

Permalink
error handling for taxonomy file and OTU table in 'classify tips' and…
Browse files Browse the repository at this point in the history
… unit tests
  • Loading branch information
JTFouquier committed Mar 6, 2015
1 parent 44d4384 commit 7d40fa5
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 23 deletions.
45 changes: 24 additions & 21 deletions ghosttree/tips/classifytips.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ def find_rep_otu_genus(otu_table_fh, taxonomy_fh, modified_otu_table_fh,
Parameters
__________
otu_table_fh : filehandle
The OTU table file handle will be an OTU table of clusters where each
line corresponds to a group of sequences denoted by their accession
numbers that clustered based on a user's desired percent similarity.
(XXXXXX if there is a duplicate, skip it, if not then do a different
code)
The OTU table filehandle will be an OTU table of clusters where each
line corresponds to a cluster of sequences represented by
tab-delimited accession numbers. OTUs were clustered based on a user's
chosen percent similarity. The first accession number may or may not
be duplicated, depending on the OTU table format.
example:
A111\tA111\tA112
A222\tA222\tA223
Expand All @@ -32,11 +32,10 @@ def find_rep_otu_genus(otu_table_fh, taxonomy_fh, modified_otu_table_fh,
columns, which are accession number and taxonomy line. The taxonomy
line must be in the format =
k__fungi;p__phylum;c__class;o__order;f__family;g__genus;s__species
There are always two underscores which is typical for "QIIME style"
taxonomy lines (cite).
Two underscores always follow the rank letter (for example "g__"),
which is typical for "QIIME style" taxonomy lines (cite).
example:
A112\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Phoma;s__El
(XXXXX RAISE ERROR if incorrect format)
modified_otu_table_fh : filehandle
Table containing genus name and
modified_otu_table_NR_fh : filehandle
Expand All @@ -45,46 +44,50 @@ def find_rep_otu_genus(otu_table_fh, taxonomy_fh, modified_otu_table_fh,
Returns
_______
otu_genus_dic : dict
all_genus_dic : dict
"""
accession_taxonomy_dic = _create_taxonomy_dic(taxonomy_fh)
all_genera_list = []
otu_genus_dic = {}
all_genus_dic = {}
for line in otu_table_fh:
accession_list = line.strip().split("\t")
del accession_list[0]
accession_list = line.strip().split("\t") # line's accession list
if accession_list[0] == accession_list[1]:
del accession_list[0] # remove the duplicate if there is one
otu_genus_list = [] # changes for each OTU
for i in accession_list:
full_taxonomy_line = accession_taxonomy_dic[i]
genus = full_taxonomy_line.split(";")
genus = genus[-2]
genus = genus[3:]
genus = genus[3:].capitalize()
otu_genus_list.append(genus)
# genus_count_dic = Counter(otu_genus_list)
most_common_genus = max(set(otu_genus_list), key=otu_genus_list.count)
all_genera_list.append(most_common_genus)
# genus winner for OTU in line
all_genera_list.append(most_common_genus) # genera for ALL lines
accession_list_str = "\t".join(accession_list)
modified_otu_table_fh.write(genus+"\t"+accession_list_str+"\n")
if most_common_genus not in otu_genus_dic:
otu_genus_dic[most_common_genus] = accession_list
if most_common_genus not in all_genus_dic:
all_genus_dic[most_common_genus] = accession_list
else:
for i in accession_list: # not efficient
otu_genus_dic[most_common_genus].append(i)
tuple_dic = otu_genus_dic.items()
all_genus_dic[most_common_genus].append(i)
tuple_dic = all_genus_dic.items()
for i in tuple_dic:
i = list(i)
modified_otu_table_NR_fh.write(str(i)+"\n")
otu_table_fh.close()
modified_otu_table_fh.close()
modified_otu_table_NR_fh.close()
print otu_genus_dic
return otu_genus_dic
print all_genus_dic
return all_genus_dic


def _create_taxonomy_dic(taxonomy_fh):
line = ""
accession_taxonomy_dic = {}
for line in taxonomy_fh:
if "g__" not in line:
raise ValueError("Taxonomy file must contain genera %r" %
line)
accession, full_taxonomy_line = line.rstrip("\n").split("\t")
accession = accession.strip()
full_taxonomy_line = full_taxonomy_line.strip()
Expand Down
50 changes: 48 additions & 2 deletions ghosttree/tips/tests/test_classifytips.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
class TestFindRepOtuGenus(unittest.TestCase):
def setUp(self):
self.otus = StringIO(otus)
self.otus_no_duplicates = StringIO(otus_no_duplicates)
self.otus_mixed_genera_clusters = StringIO(otus_mixed_genera_clusters)
self.otus_mixed_genera_tie = StringIO(otus_mixed_genera_tie)
self.otus_same_genus_names = StringIO(otus_same_genus_names)
self.taxonomy = StringIO(taxonomy)
self.taxonomy_without_genus = StringIO(taxonomy_without_genus)
self.modfile1 = StringIO(modfile1)
self.modfile2 = StringIO(modfile2)

Expand All @@ -20,6 +23,13 @@ def test_tip_seqs_and_taxonomy_correct(self):
'Murcor': ['M1', 'M2'],
'Phoma': ['P1', 'P2']})

def test_otus_no_duplicate_accessions(self):
    """OTU lines that do not repeat their first accession are handled."""
    expected = {'Candida': ['C1', 'C2'],
                'Murcor': ['M1', 'M2'],
                'Phoma': ['P1', 'P2']}
    observed = find_rep_otu_genus(self.otus_no_duplicates, self.taxonomy,
                                  self.modfile1, self.modfile2)
    self.assertDictEqual(observed, expected)

def test_tip_seqs_with_mixed_genera(self):
result = find_rep_otu_genus(self.otus_mixed_genera_clusters,
self.taxonomy, self.modfile1,
Expand All @@ -37,11 +47,29 @@ def test_tip_seqs_with_mixed_genera_tie(self):
'Murcor': ['M1', 'M2'],
'Phoma': ['P1', 'P2']})

def test_otus_same_genus_names(self):
    """Two OTUs that resolve to the same genus share one dictionary key."""
    # TODO: OTUs whose winning genus collides on the same dictionary key
    # still need dedicated handling; for now their accessions are merged.
    expected = {'Candida': ['C1', 'C2', 'M1', 'C3', 'C4', 'C5', 'P1']}
    observed = find_rep_otu_genus(self.otus_same_genus_names,
                                  self.taxonomy, self.modfile1,
                                  self.modfile2)
    self.assertDictEqual(observed, expected)

def test_broken_taxonomy_file(self):
    """A taxonomy file with no ``g__`` entries is rejected with ValueError."""
    # find_rep_otu_genus returns a dict rather than a generator, so the
    # error is raised by the call itself; no list() wrapper is needed to
    # force evaluation.
    with self.assertRaises(ValueError):
        find_rep_otu_genus(self.otus, self.taxonomy_without_genus,
                           self.modfile1, self.modfile2)

# OTU table fixture in which every line repeats its first accession.
otus = """C1\tC1\tC2
M1\tM1\tM2
P1\tP1\tP2
"""

# OTU table fixture in which the first accession of each line is NOT repeated.
otus_no_duplicates = """C1\tC2
M1\tM2
P1\tP2
"""

otus_mixed_genera_clusters = """C1\tC1\tC2\tP3
M1\tM1\tM2
P1\tP1\tP2
Expand All @@ -52,18 +80,36 @@ def test_tip_seqs_with_mixed_genera_tie(self):
P1\tP1\tP2
"""

# OTU table fixture with two clusters that each contain mostly "C" accessions
# plus one accession from another genus (M1 / P1).
otus_same_genus_names = """C1\tC1\tC2\tM1
C3\tC3\tC4\tC5\tP1
"""


taxonomy = """P1\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Phoma;s__El
P2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Phoma;s__El
P3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Phoma;s__El
P2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__phoma;s__El
P3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__phoma;s__El
P4\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__phoma;s__El
P5\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Phoma;s__El
C1\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Candida;s__El
C2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Candida;s__El
C3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Candida;s__El
C4\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Candida;s__El
C5\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Candida;s__El
M1\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Murcor;s__El
M2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Murcor;s__El
M3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Murcor;s__El
M4\tk__Fungi;p__Asco;c__Do;o__My;f__Els;g__Murcor;s__El
"""

# Taxonomy fixture whose genus field is written "__Name" without the leading
# "g", so no line contains "g__"; used to check that a ValueError is raised.
taxonomy_without_genus = """P1\tk__Fungi;p__As;c__Do;o__M;f__E;__Pho;s__El
P2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Phoma;s__El
P3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Phoma;s__El
C1\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Candida;s__El
C2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Candida;s__El
C3\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Candida;s__El
M1\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Murcor;s__El
M2\tk__Fungi;p__Asco;c__Do;o__My;f__Els;__Murcor;s__El
"""

# NOTE(review): setUp passes these strings to StringIO(), so they serve as
# the initial contents of in-memory buffers, not as paths opened on disk.
modfile1 = "repgenusOTUfile.txt"
modfile2 = "repgenusOTUfile_non-redundant.txt"
Expand Down

0 comments on commit 7d40fa5

Please sign in to comment.