Merge pull request #13 from Merck/master
Apply master changes to develop
prihoda committed Jul 30, 2019
2 parents 9008c77 + e8cc785 commit 053679b
Showing 7 changed files with 210 additions and 5 deletions.
2 changes: 1 addition & 1 deletion deepbgc/__version__.py
@@ -1 +1 @@
-__version__ = '0.1.7-dev'
+__version__ = '0.1.8'
2 changes: 2 additions & 0 deletions deepbgc/command/pipeline.py
@@ -24,6 +24,7 @@
from deepbgc.output.cluster_tsv import ClusterTSVWriter
from deepbgc.output.evaluation.pfam_score_plot import PfamScorePlotWriter
from deepbgc.output.pfam_tsv import PfamTSVWriter
from deepbgc.output.antismash_json import AntismashJSONWriter


class PipelineCommand(BaseCommand):
@@ -131,6 +132,7 @@ def run(self, inputs, output, detectors, no_detector, labels, classifiers, no_cl

        writers = []
        writers.append(GenbankWriter(out_path=os.path.join(output, output_file_name+'.full.gbk')))
        #writers.append(AntismashJSONWriter(out_path=os.path.join(output, output_file_name + '.antismash.json')))
        is_evaluation = False
        if not is_minimal_output:
            writers.append(BGCGenbankWriter(out_path=os.path.join(output, output_file_name+'.bgc.gbk')))
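Note that the new AntismashJSONWriter is only imported here; the line that would register it is left commented out, so the pipeline does not emit the antiSMASH JSON by default. A minimal sketch of enabling it, shown for illustration only (same writers list and output naming as above):

    # Sketch: register the antiSMASH sideloading JSON output alongside the full GenBank output.
    writers.append(AntismashJSONWriter(out_path=os.path.join(output, output_file_name + '.antismash.json')))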
86 changes: 86 additions & 0 deletions deepbgc/output/antismash_json.py
@@ -0,0 +1,86 @@
from deepbgc.output.writer import OutputWriter
from deepbgc import util
from deepbgc import __version__
import json
import collections

ANTISMASH_SUBREGION_LABEL_MAX_LENGTH = 20


class AntismashJSONWriter(OutputWriter):

    def __init__(self, out_path):
        super(AntismashJSONWriter, self).__init__(out_path)
        self.record_ids = []
        self.record_subregions = []
        self.record_protoclusters = []
        self.tool_meta = collections.OrderedDict()

    @classmethod
    def get_description(cls):
        return 'AntiSMASH JSON file for sideloading.'

    @classmethod
    def get_name(cls):
        return 'antismash-json'

    def write(self, record):
        cluster_features = util.get_cluster_features(record)
        classifier_names = util.get_record_classifier_names(record)
        subregions = []
        protoclusters = []
        for cluster in cluster_features:
            subregion = self._create_cluster_json(cluster, classifier_names=classifier_names)
            subregions.append(subregion)
            # TODO add protocluster?

        self.record_ids.append(record.id)
        self.record_subregions.append(subregions)
        self.record_protoclusters.append(protoclusters)
        for detector_label, meta in util.get_record_detector_meta(record).items():
            for k, v in meta.items():
                self.tool_meta['{}_{}'.format(detector_label, k)] = str(v)

    def _get_cluster_classes_str(self, cluster, classifier_name):
        class_str_list = cluster.qualifiers.get(util.format_classification_column(classifier_name))
        return class_str_list[0] if class_str_list else 'no confident class'

    def _create_cluster_json(self, cluster, classifier_names):
        classes = {cls_name: self._get_cluster_classes_str(cluster, cls_name) for cls_name in classifier_names}
        tool_name = cluster.qualifiers.get('tool', ['unspecified'])[0]
        detector_name = cluster.qualifiers.get('detector', [tool_name])[0]
        score_column = util.format_bgc_score_column(detector_name)
        score = cluster.qualifiers.get(score_column)
        details = {
            'detector': detector_name,
        }
        details.update(classes)
        if score:
            details['score'] = score[0]
        return {
            'start': int(cluster.location.start),
            'end': int(cluster.location.end),
            'label': 'Putative BGC',
            'details': details
        }

    def _create_record_json(self, name, subregions, protoclusters):
        return {
            "name": name,
            "subregions": subregions,
            "protoclusters": protoclusters
        }

    def close(self):
        zipped_records = zip(self.record_ids, self.record_subregions, self.record_protoclusters)
        data = {
            "tool": {
                "name": "DeepBGC",
                "version": __version__,
                "description": "Putative BGCs predicted using DeepBGC",
                "configuration": self.tool_meta
            },
            "records": [self._create_record_json(name, sr, pc) for name, sr, pc in zipped_records]
        }
        with open(self.out_path, 'w') as f:
            json.dump(data, f, indent=2)
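A minimal usage sketch of the new writer (the output path and the record variable below are hypothetical; the methods are the ones defined above):

    # Sketch: 'record' is assumed to be a SeqRecord already annotated by the DeepBGC pipeline.
    from deepbgc.output.antismash_json import AntismashJSONWriter

    writer = AntismashJSONWriter(out_path='example.antismash.json')
    writer.write(record)   # collect subregions and tool metadata for this record
    writer.close()         # dump {"tool": {...}, "records": [...]} to example.antismash.json

The resulting file contains a "tool" block (name, version, description and the collected detector configuration) and one "records" entry per written record, each with its "subregions" (start, end, the fixed label 'Putative BGC', and details such as detector, predicted classes and score) and a protoclusters list that is currently always empty.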
7 changes: 6 additions & 1 deletion deepbgc/output/evaluation/bgc_region_plot.py
@@ -34,6 +34,9 @@ def save_plot(self):
        num_sequences = len(self.sequence_titles)
        num_detectors = len(self.detector_labels)

        if not num_sequences:
            return

        fig, axes = plt.subplots(num_sequences, 1, figsize=(15, 1 + 0.25 * (num_detectors + 2) * num_sequences))
        if num_sequences == 1:
            axes = [axes]
@@ -50,7 +53,9 @@ def save_plot(self):
                continue

            end = clusters['nucl_end'].max()
-           x_step = 100000
+           x_step = 10000
+           if end / x_step > 20:
+               x_step = 100000
            if end / x_step > 20:
                x_step = 200000
            if end / x_step > 20:
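Two behavioural changes here: save_plot now returns early when there are no sequences to plot, and the x-axis tick step starts at 10 kb instead of 100 kb, escalating only while the sequence span would produce more than about 20 ticks. A small self-contained sketch of the visible escalation logic (the helper name is hypothetical, and the original code continues escalating beyond what this truncated hunk shows):

    def pick_x_step(end):
        # Start with 10 kb ticks so short sequences still get a usable axis.
        x_step = 10000
        if end / x_step > 20:      # more than ~20 ticks -> coarsen to 100 kb
            x_step = 100000
        if end / x_step > 20:      # still too many -> 200 kb (further steps are cut off in the diff)
            x_step = 200000
        return x_step

    print(pick_x_step(50000))     # -> 10000
    print(pick_x_step(5000000))   # -> 200000 here; the full code may pick a larger step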
9 changes: 6 additions & 3 deletions deepbgc/output/evaluation/pfam_score_plot.py
@@ -29,6 +29,8 @@ def close(self):

    def save_plot(self):
        num_sequences = len(self.sequence_titles)
        if not num_sequences:
            return
        fig, axes = plt.subplots(num_sequences, 1, figsize=(15, 1+1.5*num_sequences))
        if num_sequences == 1:
            axes = [axes]
@@ -38,7 +40,7 @@ def save_plot(self):
            axes[i].set_xlabel('')
            axes[i].set_ylabel('BGC score')
            axes[i].set_title(sequence_title)
-           x = detector_scores.index
+           x = detector_scores.index.values
            xlim = (min(x), max(x))
            axes[i].set_xlim(xlim)
            if detector_scores.empty:
@@ -47,7 +49,7 @@
            # For each detector score column
            color_idx = 0
            for column, thresholds in zip(detector_scores.columns, sequence_thresholds):
-               y = detector_scores[column]
+               y = detector_scores[column].values
                if column == 'in_cluster':
                    color = 'grey'
                    full_height_val = y * (1 + 2 * offset) - offset
@@ -56,7 +58,8 @@
                else:
                    color = cmap(color_idx)
                    color_idx += 1
-               axes[i].plot(x, y, lw=0.75, alpha=0.6, color=color, label=column)
+               marker = 'o' if len(x) == 1 else None
+               axes[i].plot(x, y, lw=0.75, alpha=0.6, color=color, label=column, marker=marker)
                axes[i].hlines(thresholds, xlim[0], xlim[1], color=color, linestyles='--', lw=0.75, alpha=0.5)
                if len(detector_scores.columns) > 1:
                    lgnd = axes[i].legend(bbox_to_anchor=(1.02, 1), loc='upper left')
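Besides the early return for empty input, the scores are now converted to plain numpy arrays via .values before plotting, and a marker is added when a sequence contributes only a single data point: a line plot of one point draws nothing, so without the marker such records would look empty. A minimal illustration of that last point (file name and values are made up; Agg backend assumed so the example runs headless):

    import matplotlib
    matplotlib.use('Agg')  # assumption: headless backend for this example
    import matplotlib.pyplot as plt

    x, y = [100], [0.9]                    # a single position/score pair
    marker = 'o' if len(x) == 1 else None  # same rule as in the change above
    plt.plot(x, y, lw=0.75, alpha=0.6, marker=marker)
    plt.savefig('single_point.png')        # hypothetical output path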
Empty file added test/unit/output/__init__.py
109 changes: 109 additions & 0 deletions test/unit/output/test_unit_writers.py
@@ -0,0 +1,109 @@
from deepbgc.output.bgc_genbank import BGCGenbankWriter
from deepbgc.output.evaluation.pr_plot import PrecisionRecallPlotWriter
from deepbgc.output.evaluation.roc_plot import ROCPlotWriter
from deepbgc.output.genbank import GenbankWriter
from deepbgc.output.evaluation.bgc_region_plot import BGCRegionPlotWriter
from deepbgc.output.cluster_tsv import ClusterTSVWriter
from deepbgc.output.evaluation.pfam_score_plot import PfamScorePlotWriter
from deepbgc.output.pfam_tsv import PfamTSVWriter
from deepbgc.output.antismash_json import AntismashJSONWriter
from deepbgc import util
from deepbgc.data import PFAM_DB_VERSION
import collections
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_dna
import os
import pytest


class WriterTest:
    def __init__(self, cls, path):
        self.cls = cls
        self.path = path

    def __str__(self):
        return str(self.cls)

    def __repr__(self):
        return str(self.cls)


WRITERS = [
    BGCGenbankWriter,
    PrecisionRecallPlotWriter,
    ROCPlotWriter,
    GenbankWriter,
    BGCRegionPlotWriter,
    ClusterTSVWriter,
    PfamScorePlotWriter,
    PfamTSVWriter,
    AntismashJSONWriter
]

@pytest.fixture
def processed_record(detector_name='deepbgc', detector_label='deepbgc', score_threshold=0.5):
    comment_key = util.format_detector_meta_key(detector_label)
    record = SeqRecord(Seq('ACTGCTCGACTGATT', alphabet=generic_dna))
    record.annotations['structured_comment'] = collections.OrderedDict()
    record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
        name=detector_name,
        label=detector_label,
        score_threshold=score_threshold
    )
    # Add protein features
    record.features.append(SeqFeature(FeatureLocation(0, 2), type='CDS', qualifiers={'locus_tag': ['A']}))
    record.features.append(SeqFeature(FeatureLocation(2, 5), type='CDS', qualifiers={'locus_tag': ['B']}))
    record.features.append(SeqFeature(FeatureLocation(5, 8), type='CDS', qualifiers={'locus_tag': ['C']}))
    # Add pfam features
    score_column = util.format_bgc_score_column(detector_name)
    qualifiers = {score_column: [0.4], 'db_xref': ['PF00001'], 'locus_tag': ['A'], 'database': [PFAM_DB_VERSION]}
    record.features.append(SeqFeature(FeatureLocation(0, 2), type=util.PFAM_FEATURE, qualifiers=qualifiers))
    qualifiers = {score_column: [0.7], 'db_xref': ['PF00002'], 'locus_tag': ['B'], 'database': [PFAM_DB_VERSION]}
    record.features.append(SeqFeature(FeatureLocation(2, 5), type=util.PFAM_FEATURE, qualifiers=qualifiers))
    qualifiers = {score_column: [0.6], 'db_xref': ['PF00003'], 'locus_tag': ['C'], 'database': [PFAM_DB_VERSION]}
    record.features.append(SeqFeature(FeatureLocation(5, 8), type=util.PFAM_FEATURE, qualifiers=qualifiers))
    # Add BGC features
    qualifiers = { score_column: ['0.6'], 'detector': [detector_name], 'detector_label': [detector_label]}
    record.features.append(SeqFeature(FeatureLocation(0, 5), type='cluster', qualifiers=qualifiers))
    qualifiers = { 'detector': ['annotated'], 'detector_label': ['annotated']}
    record.features.append(SeqFeature(FeatureLocation(2, 8), type='cluster', qualifiers=qualifiers))
    return record


@pytest.mark.parametrize("writer_cls", WRITERS)
def test_unit_writer_full_record(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()
    assert os.path.exists(out_path)


@pytest.mark.parametrize("writer_cls", WRITERS)
def test_unit_writer_no_record(tmpdir, writer_cls):
    out_path = os.path.join(str(tmpdir), 'file.png')
    writer = writer_cls(out_path=out_path)
    writer.close()


@pytest.mark.parametrize("writer_cls", WRITERS)
def test_unit_writer_no_features(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    processed_record.features = []
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()


@pytest.mark.parametrize("writer_cls", WRITERS)
def test_unit_writer_single_feature(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    cds_features = util.get_protein_features(processed_record)
    pfam_features = util.get_pfam_features(processed_record)
    cluster_features = util.get_cluster_features(processed_record)
    processed_record.features = cds_features[:1] + pfam_features[:1] + cluster_features[:1]
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()
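These are standard parametrized pytest tests; assuming DeepBGC and its test dependencies are installed, they can be run with e.g. pytest test/unit/output/test_unit_writers.py.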
