In [1]:
from pygustus import augustus
import shutil
import wget
import os

data_dir = 'data'
out_dir = 'out'
debug_dir = 'debug'

# Rebuild the working directories from scratch so that every run of the
# notebook starts with empty data/, out/ and debug/ folders.
# NOTE(review): wget.download appears to pick a new file name instead of
# overwriting an existing target, so data_dir should be empty here - confirm.
for work_dir in (data_dir, out_dir, debug_dir):
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    os.makedirs(work_dir)

# Fetch the small two-sequence example used by most cells below.
data_url = 'https://raw.githubusercontent.com/Gaius-Augustus/pygustus/main/tests/data/example.fa'
data_file = os.path.join(data_dir, 'example.fa')
wget.download(data_url, out=data_file)

# Fetch the larger genome file used by the partitioned parallel run.
data_url = 'https://raw.githubusercontent.com/Gaius-Augustus/pygustus/main/tests/data/genome.fa'
data_file = os.path.join(data_dir, 'genome.fa')
wget.download(data_url, out=data_file)

100% [..........................................................................] 1016673 / 1016673

'data/genome.fa'

In [2]:
# Run a simple prediction on the example sequences. No outfile is
# given, so the AUGUSTUS results are printed directly below the cell.

augustus.predict(
    os.path.join(data_dir, 'example.fa'),
    species='human',
    UTR=True,
    softmasking=False,
)

Execute AUGUSTUS with given options.
# This output was generated with AUGUSTUS (version 3.4.0).
# AUGUSTUS is a gene prediction tool written by M. Stanke (mario.stanke@uni-greifswald.de),
# O. Keller, S. König, L. Gerischer, L. Romoth and Katharina Hoff.
# Please cite: Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008),
# Using native and syntenically mapped cDNA alignments to improve de novo gene finding
# Bioinformatics 24: 637-644, doi 10.1093/bioinformatics/btn013
# No extrinsic information on sequences given.
# Initializing the parameters using config directory /usr/share/augustus/config/ ...
# human version. Using species specific transition matrix: /usr/share/augustus/config/species/human/human_trans_shadow_partial_utr.pbl
# Looks like data/example.fa is in fasta format.
# We have hints for 0 sequences and for 0 of the sequences in the input set.
#
# ----- prediction on sequence number 1 (length = 9453, name = HS04636) -----
#
# Predicted genes for sequence numb

In [3]:
# Run the same simple prediction, but write the results to
# out/aug_simple.gff instead of printing them.

out_file = os.path.join(out_dir, 'aug_simple.gff')
augustus.predict(
    os.path.join(data_dir, 'example.fa'),
    species='human',
    UTR=True, softmasking=False,
    outfile=out_file)

Execute AUGUSTUS with given options.
Output written to: out/aug_simple.gff


In [4]:
# Parallel execution with two jobs: the input file is split and the
# partial predictions are joined again.
#   joined results -> out/aug_parallel.gff
#   debug output   -> debug/run_augustus_parallel

out_file = os.path.join(out_dir, 'aug_parallel.gff')
cur_debug_dir = os.path.join(debug_dir, 'run_augustus_parallel')
augustus.predict(
    os.path.join(data_dir, 'example.fa'),
    species='human',
    UTR=True,
    softmasking=True,
    jobs=2,
    outfile=out_file,
    debugOutputDir=cur_debug_dir,
)

Execute AUGUSTUS with 2 jobs in parallel.
Joined output written to: out/aug_parallel.gff


In [5]:
# Parallel execution on partitioned sequences: pygustus derives the
# AUGUSTUS parameters predictionStart and predictionEnd automatically
# from the chunksize and overlap values given here.
#   joined results -> out/aug_parallel_on_seq.gff
#   debug output   -> debug/run_augustus_parallel_on_seq

out_file = os.path.join(out_dir, 'aug_parallel_on_seq.gff')
cur_debug_dir = os.path.join(debug_dir, 'run_augustus_parallel_on_seq')
augustus.predict(
    os.path.join(data_dir, 'genome.fa'),
    species='human',
    UTR=True,
    softmasking=True,
    jobs=5,
    outfile=out_file,
    partitionLargeSequences=True,
    chunksize=250000,
    overlap=50000,
    maxSeqSize=750000,
    debugOutputDir=cur_debug_dir,
)

Execute AUGUSTUS with 5 jobs in parallel.
Joined output written to: out/aug_parallel_on_seq.gff


In [6]:
# Show fasta file information (per-sequence base counts, totals and
# GC content). The path is built from data_dir, like in the other
# cells, so it follows any change to the data directory.
augustus.show_fasta_info(os.path.join(data_dir, 'example.fa'))

9453 bases.	HS04636 BASE COUNT     2937 a   1716 c   1710 g   3090 t
2344 bases.	HS08198 BASE COUNT     400 a   730 c   778 g   436 t
summary: BASE COUNT     3337 a   2446 c   2488 g   3526 t
total 11797bp in 2 sequence(s).
gc: 41.82419259133678%


In [7]:
# Print the usage text of the underlying AUGUSTUS binary
# (its command line parameters and their meaning).
augustus.show_aug_help()

Execute AUGUSTUS with given options.
usage:
augustus [parameters] --species=SPECIES queryfilename

'queryfilename' is the filename (including relative path) to the file containing the query sequence(s)
in fasta format.

SPECIES is an identifier for the species. Use --species=help to see a list.

parameters:
--strand=both, --strand=forward or --strand=backward
--genemodel=partial, --genemodel=intronless, --genemodel=complete, --genemodel=atleastone or --genemodel=exactlyone
  partial      : allow prediction of incomplete genes at the sequence boundaries (default)
  intronless   : only predict single-exon genes like in prokaryotes and some eukaryotes
  complete     : only predict complete genes
  atleastone   : predict at least one complete gene
  exactlyone   : predict exactly one complete gene
--singlestrand=true
  predict genes independently on each strand, allow overlapping genes on opposite strands
  This option is turned off by default.
--hintsfile=hintsfilename
  When this option 

In [8]:
# Print the Pygustus help: usage of augustus.predict() and the
# Pygustus-specific parameters such as chunksize or debugOutputDir.
augustus.help()

usage:
augustus.predict(queryfilename, species='SPECIES', [augustus_parameters], [pygustus_parameters])

'queryfilename' is the filename (including relative path) to the file
containing the query sequence(s) in fasta format.

SPECIES is an identifier for the species.
Use augustus.show_species_info() to see a list.

augustus_parameters are all possible parameters for AUGUSTUS.
Use augustus.show_aug_help() to find more information or
augustus.show_aug_paramlist() to see a list.

pygustus_parameters:

chunksize (int)
  description:
    If this option is set and jobs > 1, each AUGUSTUS instance is executed 
    on sequence segments of the maximum size n. 
  default value: 2500000 

debugOutputDir (string)
  description:
    If the directory is specified, all generated files, i.e. the split of 
    the input file and intermediate results, as well as the generated 
    AUGUSTUS command lines are stored there. This option works only for 
    the parallelization, i. e. jobs > 1 is set. 

jobs 