# Autometa hybrid O.O.P. and Functional Approach

Many different approaches have been outlined in the NSF Career Proposal. These involve updates/new algorithms for:

- taxonomic assignment
- binning methods
- pangenome binning
- ...

In [3]:
# Current location
# Shell commands (IPython `!` escape): print the working directory, then
# list the contents of the autometa package directory.
!echo "current directory $(pwd)"
!echo "directory contents:"
!ls autometa

current directory /Users/patron/autometa
directory contents:
__init__.py     config          dependencies.py project.py      user.py
__pycache__     dataset.py      mag.py          taxonomy.py


# Configuration 

## Checking databases and dependencies all in one place.

A config file holds the relevant database information so that it can easily be inspected and extended as development continues, e.g. to use multiple marker sets or to automatically retrieve required database files that the user has not already downloaded/formatted.

In [5]:
# Print the default config file; entries reference each other with
# ${section:key} placeholders (e.g. db_dir is built from common:home_dir).
!cat autometa/config/default.cfg

[common]
home_dir = /Users/patron/autometa
db_dir = ${common:home_dir}/databases
ncbi_dir = ${common:db_dir}/ncbi
markers_dir = ${common:db_dir}/markers

[database_urls]
taxdump = ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
accession2taxid = ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz
nr = ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz

[checksums]
acc2taxid = ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5
taxdump = ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5

[ncbi]
names = ${common:ncbi_dir}/names.dmp
nodes = ${common:ncbi_dir}/nodes.dmp
accession2taxid = ${common:ncbi_dir}/prot.accession2taxid.gz
blastdb = ${common:ncbi_dir}/nr.dmnd

[markers]
# Marker sets should be in the format of kingdom.marker_set_name.[cutoffs,hmm]
bacteria_single_copy = ${common:markers_dir}/bacteria.single_copy.hmm
bacteria_single_copy_cutoffs = ${common:markers_dir}/bacteria.single_copy.cutoff

### Example of using the Autometa `dependencies` module

1. Checking all executables are available
2. Checking paths to required database files

In [7]:
from autometa import dependencies
# With verbose=True the resolved path of each required executable is printed
# (diamond, hmmsearch, hmmpress, hmmscan, prodigal, bowtie2 in the output below).
dependencies.check_executables(verbose=True)
# Can also easily track/debug using verbose arg
# Load the database config; the returned object is queried later with
# config.get(section, key).
# NOTE(review): presumably this call prints "Database Dependencies Satisfied" — confirm.
config = dependencies.load_databases('autometa/config/default.cfg')

diamond		: /Users/patron/anaconda3/envs/autometa/bin/diamond
hmmsearch		: /Users/patron/anaconda3/envs/autometa/bin/hmmsearch
hmmpress		: /Users/patron/anaconda3/envs/autometa/bin/hmmpress
hmmscan		: /Users/patron/anaconda3/envs/autometa/bin/hmmscan
prodigal		: /Users/patron/anaconda3/envs/autometa/bin/prodigal
bowtie2		: /Users/patron/anaconda3/envs/autometa/bin/bowtie2
Executable Dependencies Satisfied
Database Dependencies Satisfied


In [8]:
# Look up the `blastdb` key of the [ncbi] section; the ${common:ncbi_dir}
# placeholder is returned already expanded (see output).
config.get('ncbi','blastdb')

'/Users/patron/autometa/databases/ncbi/nr.dmnd'

### Displaying some functionality of the Autometa Dataset class

#### Exploring a metagenome
1. GC
2. Coverage
3. size
4. number of sequences
5. Writing to stdout
6. etc.

#### Annotating a metagenome
1. kingdom specific
2. applying a length filter
3. Calling ORFs
4. Finding Markers with specified marker sets

In [9]:
from autometa.dataset import Dataset
# Dataset

In [10]:
# IPython introspection: show the Dataset docstring/signature (no output is recorded here).
Dataset?

In [11]:
# Demo file
assembly = 'test_data/demo.fasta'
# Demo output directory
output_folder = 'demo'

# Initialize the autometa dataset
# NOTE(review): the variable name `autometa` is the same as the package name;
# it is used as the Dataset instance for the rest of the notebook.
autometa = Dataset(
    dbconfig=config,  # config object returned by dependencies.load_databases(...)
    metagenome=assembly,
    outdir=output_folder,  # output reports this directory being created on init
    verbose=True,
    force=True,  # presumably allows overwriting existing results — TODO confirm
    length_cutoff=3000,  # presumably the minimum contig length used by filter_length() — TODO confirm
)

Created output directory /Users/patron/autometa/demo


In [12]:
# List every attribute and method available on the Dataset instance.
dir(autometa)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_config',
 '_coverage',
 '_force',
 '_is_filtered',
 '_kingdom',
 '_lca',
 '_length_cutoff',
 '_marker_set',
 '_metagenome',
 '_orfs',
 '_outdir',
 '_seqs',
 '_spades',
 '_verbose',
 'call_orfs',
 'coverage',
 'cpus',
 'cutoffs',
 'fasta',
 'filter_kingdom',
 'filter_length',
 'gc',
 'get_coverage',
 'get_gc_content',
 'get_markers',
 'hmms',
 'hmmscan_fpath',
 'is_filtered',
 'kingdom',
 'lca',
 'lca_fpath',
 'marker_set',
 'markers',
 'markers_fpath',
 'master',
 'master_fpath',
 'metagenome',
 'ncbi',
 'norfs',
 'nseqs',
 'orfs',
 'orfs_fpath',
 'pipeline',
 'prodigal_stdout',
 'seqs',
 'size',
 'table',
 'taxa',
 'ta

In [13]:
# Sequence count before length filtering (1546 in the recorded output).
autometa.nseqs

1546

In [14]:
# Apply the length filter; this changes the dataset's state — nseqs reads
# 1546 before this call and 18 after it in the recorded outputs.
autometa.filter_length()

In [15]:
# Sequence count after length filtering (18 in the recorded output).
autometa.nseqs

18

In [16]:
# ORFs have not been called yet at this point, so this displays an empty dict.
autometa.orfs

{}

In [17]:
# Call ORFs: per the log below, the 18 filtered sequences are written to
# demo/demo.filtered and Prodigal is run on that file (-p meta), producing
# demo.filtered.orfs.faa.
autometa.call_orfs()

18 seqs written to /Users/patron/autometa/demo/demo.filtered
RunningProdigal: prodigal -i /Users/patron/autometa/demo/demo.filtered -a /Users/patron/autometa/demo/demo.filtered.orfs.faa -p meta -m -q


In [25]:
# Current kingdom setting ('bacteria' in the recorded output).
autometa.kingdom

'bacteria'

In [26]:
# Path to the marker HMM file for the current kingdom.
autometa.hmms

'/Users/patron/autometa/databases/markers/bacteria.single_copy.hmm'

In [27]:
# Path to the marker cutoffs file for the current kingdom.
# NOTE(review): the path shown ends in `.cutoffs`, while default.cfg lists
# `bacteria.single_copy.cutoff` (no trailing "s") — confirm which is correct.
autometa.cutoffs

'/Users/patron/autometa/databases/markers/bacteria.single_copy.cutoffs'

In [28]:
# Switch the kingdom; the later get_markers() run uses archaea.single_copy.*
# marker files as a result (see its logged commands).
autometa.kingdom = 'archaea'

In [29]:
# Confirm the kingdom setting was updated.
autometa.kingdom

'archaea'

In [31]:
# Find markers: per the log below this runs hmmscan on the called ORFs against
# the archaea marker HMMs, then make_marker_table.py to write demo.filtered.markers.
autometa.get_markers()

RunningHmmscan: hmmscan --cpu 1 --tblout /Users/patron/autometa/demo/demo.filtered.hmmscan /Users/patron/autometa/databases/markers/archaea.single_copy.hmm /Users/patron/autometa/demo/demo.filtered.orfs.faa
RunningMakeMarkerTable: python /Users/patron/autometa/pipeline/make_marker_table.py --assembly /Users/patron/autometa/demo/demo.filtered --hmm /Users/patron/autometa/databases/markers/archaea.single_copy.hmm --cutoffs /Users/patron/autometa/databases/markers/archaea.single_copy.cutoffs --processors 1 --out /Users/patron/autometa/demo/demo.filtered.markers


In [34]:
# Summary statistics of the marker table (num_single_copies over the 18 contigs).
autometa.markers.describe()

Unnamed: 0,num_single_copies
count,18.0
mean,1.388889
std,4.230685
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,17.0


### Displaying some functionality of the Autometa TaxonUtils class

1. Searching for homology
2. Finding the LCA of taxids


In [35]:
from autometa.taxonomy import TaxonUtils
# Build the taxonomy helper from the loaded config; with verbose=True it logs
# parsing of nodes.dmp and names.dmp from the configured ncbi directory.
taxutils = TaxonUtils(config, verbose=True)

parsing nodes: 27953it [00:00, 279526.93it/s]

Processing nodes from /Users/patron/autometa/databases/ncbi/nodes.dmp


parsing names: 42408it [00:00, 424079.39it/s]  

nodes loaded
Processing names from /Users/patron/autometa/databases/ncbi/names.dmp


                                               

names loaded




In [36]:
# Compute the LCA of two taxids
# On this first call the log shows pickled LCA data structures
# (tour/level/occurrence/sparse .pkl files) being loaded from the ncbi directory.
taxutils.lca(1283, 1273)

Preparing data structures for LCA
unpickling /Users/patron/autometa/databases/ncbi/tour.pkl
/Users/patron/autometa/databases/ncbi/tour.pkl object unpickled
unpickling /Users/patron/autometa/databases/ncbi/level.pkl
/Users/patron/autometa/databases/ncbi/level.pkl object unpickled
unpickling /Users/patron/autometa/databases/ncbi/occurrence.pkl
/Users/patron/autometa/databases/ncbi/occurrence.pkl object unpickled
unpickling /Users/patron/autometa/databases/ncbi/sparse.pkl
/Users/patron/autometa/databases/ncbi/sparse.pkl object unpickled


1783272

In [39]:
# f-string formatting demo, followed by the LCA of a second pair of taxids.
here_is_a_number = 12
print(f'The number is : {float(here_is_a_number)}')
print(f'lca: {taxutils.lca(1283, 1287)}')

The number is : 12.0
lca: 1279


In [41]:
# IPython introspection: show help for TaxonUtils.parse_blastp.
taxutils.parse_blastp?

In [42]:
# IPython introspection: show help for TaxonUtils.search_blastdb.
taxutils.search_blastdb?

In [44]:
# Path of the BLAST database used by taxutils; matches the [ncbi] blastdb config value.
taxutils.blastdb_fpath

'/Users/patron/autometa/databases/ncbi/nr.dmnd'

In [46]:
# NOTE: write_taxa() is not implemented yet — this call raises
# NotImplementedError (see output) and will halt a "Run All" at this cell.
taxutils.write_taxa()

NotImplementedError: 

In [47]:
# Returns a dict mapping contig name -> coverage (float); in the recorded
# output each value equals the `cov_` field embedded in the contig name.
autometa.get_coverage()

{'NODE_562_length_31692_cov_223.806': 223.806,
 'NODE_2043_length_3085_cov_219.547': 219.547,
 'NODE_2044_length_3080_cov_221.826': 221.826,
 'NODE_2045_length_3078_cov_223.065': 223.065,
 'NODE_2046_length_3074_cov_222.186': 222.186,
 'NODE_2047_length_3072_cov_222.687': 222.687,
 'NODE_2048_length_3072_cov_225.408': 225.408,
 'NODE_2049_length_3068_cov_222.83': 222.83,
 'NODE_2050_length_3046_cov_224.576': 224.576,
 'NODE_2051_length_3046_cov_224.232': 224.232,
 'NODE_2052_length_3044_cov_224.729': 224.729,
 'NODE_2053_length_3036_cov_225.316': 225.316,
 'NODE_2054_length_3034_cov_230.157': 230.157,
 'NODE_2055_length_3023_cov_226.367': 226.367,
 'NODE_2056_length_3017_cov_217.338': 217.338,
 'NODE_2057_length_3015_cov_226.655': 226.655,
 'NODE_2058_length_3011_cov_227.199': 227.199,
 'NODE_2059_length_3009_cov_227.189': 227.189}