scripts/getAnnoFastaFromJoingenes.py

#!/usr/bin/env python3

# Author: Katharina J. Hoff
# E-Mail: katharina.hoff@uni-greifswald.de
# Last modified on February 15th 2024
#
# This Python script extracts CDS features from a GTF file, excises
# corresponding sequence windows from a genome FASTA file, stitches the
# codingseq parts together, adds letters N at the ends if bases are
# annotated as missing by frame in the GTF file, makes reverse complement
# if required, and translates to protein sequence.
# Output files are:
#    * file with protein sequences in FASTA format,
#    * file with coding squences in FASTA format
# The script automatically checks for in-frame stop codons and prints a
# warning to STDOUT if such genes are in the GTF-file. The IDs of bad genes
# are printed to a file bad_genes.lst. Option -s allows to exclude bad genes
# from the FASTA output file, automatically.
# Beware: The script assumes that the gtf input file is sorted by coordinates!
#         Assumptions on AUGUSTUS-specific GTF format are made.

try:
    import argparse
except ImportError:
    raise ImportError(
        'Failed to import argparse. Try installing with \"pip3 install argparse\"')

try:
    import re
except ImportError:
    raise ImportError(
        'Failed to import argparse. Try installing with \"pip3 install re\"')

try:
    from Bio.Seq import Seq
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
except ImportError:
    raise ImportError(
        'Failed to import biophython modules. Try installing with \"pip3 install biopython\"')


parser = argparse.ArgumentParser(
    description='Generate *.codingseq and *.aa FASTA-format files from genes \
                 in a GTF-file produced by AUGUSTUS auxprogs tool joingenes \
                 and a corresponding genomic FASTA-file.')
parser.add_argument('-g', '--genome', required=True,
                    type=str, help='genome sequence file (FASTA-format)')
parser.add_argument('-o', '--out', required=True, type=str,
                    help="name stem pf output file with coding sequences and \
                    protein sequences (FASTA-format); will be extended by \
                    .codingseq/.aa")
parser.add_argument('-t', '--table', dest='translation_table', default=1,
                    type=int, help='Translational code table number (INT)')
parser.add_argument('-s', '--filter_out_invalid_stops', dest='filter',
                    type=bool, help='Suppress output of protein sequences \
                    that contain internal stop codons.', default=False)
parser.add_argument('-p', '--print_format_examples', required=False, action='store_true',
                    help="Print gtf/gff3 input format examples, do not perform analysis")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-f', '--gtf',
                    type=str, help='file with CDS coordinates (GTF-format)')
group.add_argument('-3', '--gff3',
                    type=str, help='file with CDS coordinates (GFF3 format)')
args = parser.parse_args()


if args.print_format_examples:
    print('This script requires an annotation of protein coding genes with CDS ' +
        'features either in GTF or GFF3 format. Since both formats are sometimes ' +
        'inhomogeneous accross different tools, we here provide compatible ' +
        'examples. ' +
        'This script will only process the CDS lines. The input data may contain other feature lines that ' +
        'are ignored.')
    print('\nGTF format example:\n')
    print('NW_018027262_1\tAUGUSTUS\tCDS\t347\t525\t0.5\t-\t0\ttranscript_id "g1.t1"; gene_id "g1";\n' +
          'NW_018027262_1\tAUGUSTUS\tCDS\t1791\t2242\t0.59\t-\t2\ttranscript_id "g1.t1"; gene_id "g1";\n' +
          'NW_018027262_1\tAUGUSTUS\tCDS\t2451\t2682\t0.24\t-\t0\ttranscript_id "g1.t1"; gene_id "g1";\n')
    print('GFF3 format example:\n')
    print('NW_018027262_1\tAUGUSTUS\tCDS\t347\t525\t0.5\t-\t0\tID=g1.t1.CDS1;Parent=g1.t1;\n' +
          'NW_018027262_1\tAUGUSTUS\tCDS\t1791\t2242\t0.59\t-\t2\tID=g1.t1.CDS2;Parent=g1.t1;\n' +
          'NW_018027262_1\tAUGUSTUS\tCDS\t2451\t2682\t0.24\t-\t0\tID=g1.t1.CDS3;Parent=g1.t1;\n')
    print('\nThis script has successfully been tested with GTF format produced by BRAKER, and with' +
        'the GFF3 format produced by gtf2gff3.pl (Augustus/scripts).')
    exit(0)

# output file names:
codingseqFile = args.out + ".codingseq"
proteinFile = args.out + ".aa"

# Read GTF file CDS entries for transcripts
tx2seq = {}
tx2str = {}
cds = {}

if args.gtf:
    try:
        with open(args.gtf, "r") as gtf_handle:
            for line in gtf_handle:
                if re.match(
                        r"\S+\t[^\t]+\tCDS\t\d+\t\d+\t\S+\t\S+\t\d\t.*transcript_id (\S+)", line):
                    seq_id, st, en, stx, fr, tx_id = re.match(
                        r"(\S+)\t[^\t]+\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t(\d)\t.*transcript_id (\S+)", line).groups()
                    tx_id = re.sub(r'\"(\S+)\"', r'\1', tx_id)
                    tx_id = re.sub(r';', r'', tx_id)
                    if seq_id not in cds:
                        cds[seq_id] = {}
                    if tx_id not in cds[seq_id]:
                        cds[seq_id][tx_id] = []
                    cds[seq_id][tx_id].append(
                        {'start': int(st), 'end': int(en), 'strand': stx,
                         'frame': int(fr)})
                    if not tx_id in tx2seq:
                        tx2seq[tx_id] = seq_id
                        tx2str[tx_id] = stx
    except IOError:
        print("Error: Failed to open file " + args.gtf + "!")
        exit(1)
elif args.gff3:
    try:
        with open(args.gff3, "r") as gff3_handle:
            for line in gff3_handle:
                if re.match(r"\S+\t\S+\tCDS\t\d+\t\d+\t\S+\t\S+\t\d\tID=[^;]+;Parent=[^;]+;", line):
                    seq_id, st, en, stx, fr, tx_id = re.match(
                        r"(\S+)\t\S+\tCDS\t(\d+)\t(\d+)\t\S+\t(\S+)\t(\d)\tID=[^;]+;Parent=([^;]+);", line).groups()
                    if seq_id not in cds:
                        cds[seq_id] = {}
                    if tx_id not in cds[seq_id]:
                        cds[seq_id][tx_id] = []
                    cds[seq_id][tx_id].append(
                        {'start': int(st), 'end': int(en), 'strand': stx,
                         'frame': int(fr)})
                    if not tx_id in tx2seq:
                        tx2seq[tx_id] = seq_id
                        tx2str[tx_id] = stx
    except IOError:
        print("Error: Failed to open file " + args.gtf + "!")
        exit(1)
else:
    print("Error: Neither annotation file in GTF, nor in GFF3 was provided!")
    exit(1)

# Read genome file (single FASTA entries are held in memory, only), extract
# CDS sequence windows, add N when frame information states missing nucleotides
# Replace all unusual nucleotides in genome by N
seq_len = {}
codingseq = {}
regex = re.compile("[^ATCGatcgNn]");
try:
    with open(args.genome, "r") as genome_handle:
        for record in SeqIO.parse(genome_handle, "fasta"):
            seq_len[record.id] = len(record.seq)
            record.seq = re.sub(regex, r'N', str(record.seq))
            if record.id in cds:
                for tx in cds[record.id]:
                    if tx not in codingseq:
                        codingseq[tx] = SeqRecord(
                            Seq(""), id=tx, description=tx)
                    nCDS = len(cds[record.id][tx])
                    for i in range(0, nCDS):
                        cds_line = cds[record.id][tx][i]
                        if i == 0 and cds_line['strand'] == '+' and cds_line['frame'] != 0:
                            codingseq[tx].seq += Seq((3 - cds_line['frame'])
                                                     * 'N')
                        codingseq[tx].seq += Seq(record.seq[cds_line['start'] -
                                                        1:cds_line['end']])
                        if i == (nCDS - 1):
                            if cds_line['strand'] == '+':
                                if (len(codingseq[tx].seq) % 3) != 0:
                                    codingseq[tx].seq += Seq(
                                        (3 - (len(codingseq[tx]) % 3)) * 'N')
                        if i == (nCDS - 1) and cds_line['strand'] == '-':
                            if cds_line['frame'] != 0:
                                codingseq[tx].seq += Seq((3 - cds_line['frame'])
                                                         * 'N')
                            if(len(codingseq[tx].seq) % 3) != 0:
                                codingseq[tx].seq = Seq(
                                    (3 - (len(codingseq[tx].seq) % 3)) * 'N') + codingseq[tx].seq
                            codingseq[tx].seq = codingseq[tx].seq.reverse_complement()
except IOError:
    print("Error: Failed to open file " + args.genome + "!")
    exit(1)

# Print coding sequences to file
try:
    with open(codingseqFile, "w") as codingseq_handle:
        for tx_id, seq_rec in codingseq.items():
            SeqIO.write(seq_rec, codingseq_handle, "fasta")
except IOError:
    print("Error: Failed to open file " + codingseqFile + "!")
    exit(1)


# Translate coding sequences, identify sequences with in-frame stop codons,
# print proteins in FASTA format
bad_tx = {}
try:
    with open(proteinFile, "w") as protein_handle:
        for tx_id, seq_rec in codingseq.items():
            is_good = True
            seq_rec.seq = seq_rec.seq.translate(table=args.translation_table)
            match = re.search(r"(\*)(\w|\*)", str(seq_rec.seq))
            if match:
                # 1-based amino acid index of stop codon
                bad_tx[tx_id] = match.start(0) + 1
            if ((args.filter == True) and (tx_id not in bad_tx)) or args.filter == False:
                SeqIO.write(seq_rec, protein_handle, "fasta")
except IOError:
    print("Error: Failed to open file " + proteinFile + "!")
    exit(1)

# Print IDs of genes with in-frame stop codons if any were found
if len(bad_tx) > 0:
    try:
        with open("bad_genes.lst", "w") as bad_handle:
            bad_handle.write(
                "# tx_id\tstop_in_amino_acid\tstop_in_mRNA_start\tstop_in_mRNA_end\tgenomic_transcript_start\tgenomic_transcript_end\tstrand\n")
            for tx in bad_tx:
                bad_handle.write(tx + "\t" + str(bad_tx[tx]) + "\t" + str(
                    bad_tx[tx] * 3 - 2) + "\t" + str(bad_tx[tx] * 3) + "\t")
                left_most = -1
                right_most = -1
                for i in range(0, len(cds[tx2seq[tx]][tx])):
                    if (left_most == -1) or (cds[tx2seq[tx]][tx][i]['start'] <= left_most):
                        left_most = cds[tx2seq[tx]][tx][i]['start']
                    if (right_most == -1) or (cds[tx2seq[tx]][tx][i]['end'] >= right_most):
                        right_most = cds[tx2seq[tx]][tx][i]['end']
                bad_handle.write(str(left_most) + '\t' + str(right_most) + '\t' + tx2str[tx] +
                                 '\n')
    except IOError:
        print("Error: Failed to open file bad_genes.lst!")
        exit(1)
    print("WARNING: The GTF file contained " + str(len(bad_tx)) + " gene(s) " +
          "with internal Stop codons (see file bad_genes.lst).")
    if args.filter:
        print("Note: Translations of the bad genes have already been excluded " +
              "from the output file " + args.gtf + ".")
    else:
        print("Note: Translations of the bad genes have been included in the " +
              "output file. If you wish to exclude them, run " +
              "getAnnoFastaFromJoingenes.py with option --filter!")