In [None]:
import sys, os, tarfile
from tiberius.genome_anno import Anno
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tiberius.eval_model_class import PredictionGTF
from Bio import SeqIO
from Bio.Seq import Seq
from tiberius.main import (assemble_transcript,
                            check_in_frame_stop_codons,
                            check_tf_version,
                            download_weigths,
                            extract_tar_gz,
                            load_genome, compute_parallel_factor)

In [None]:
# Run configuration.
# model_path points at a local copy of the model weights; set it to None
# if you want the default model to be downloaded instead.
model_path = "../../model_weights/v2/tiberius_weights_v2"
strand = '+'
# seq_len is used both as the model's seq_len and as the chunk length
# when the genome is loaded
seq_len = 259992
batch_size = 2
# parallel factor computed from seq_len (passed to PredictionGTF as
# parallel_factor)
hmm_parallel = compute_parallel_factor(seq_len)

# Choose the download URL of the default weights; which file is used depends
# on what check_tf_version reports for the installed TensorFlow version.
if check_tf_version(tf.__version__):
    model_url = 'https://bioinf.uni-greifswald.de/bioinf/tiberius/models/tiberius_weights.tgz'
else:
    model_url = 'https://bioinf.uni-greifswald.de/bioinf/tiberius/models/tiberius_weights_tf2_17.keras'

# Only download when no local model_path was configured.
if not model_path:
    model_weights_dir = '../../model_weights'
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent
    os.makedirs(model_weights_dir, exist_ok=True)

    model_file_name = model_url.split('/')[-1]
    model_path = download_weigths(model_url, f'{model_weights_dir}/{model_file_name}')
    # A .tgz archive has to be unpacked first; the model path is then the
    # archive path without its '.tgz' suffix. The suffix test and the strip
    # use the same literal so they cannot disagree.
    if model_path and model_path.endswith('.tgz'):
        extract_tar_gz(model_path, model_weights_dir)
        model_path = model_path[:-len('.tgz')]

# extract test data if necessary
inp_data_dir = 'inp/'
if not os.path.exists(inp_data_dir):
    # Open the archive BEFORE creating the directory: if "inp.tar.gz" is
    # missing or unreadable, no empty input directory is left behind that
    # would make every later run skip the extraction.
    with tarfile.open("inp.tar.gz", "r:gz") as tar:
        os.mkdir(inp_data_dir)
        tar.extractall(path=inp_data_dir)

# directory that receives the prediction output; created when missing
# (exist_ok makes re-running the cell safe without a separate exists check)
out_dir = 'test_prediction/'
os.makedirs(out_dir, exist_ok=True)

# input genome file
# os.path.join avoids the doubled path separator that plain string
# concatenation produces when inp_data_dir already ends with '/'
genome_path = os.path.join(inp_data_dir, 'genome.fa')
genome = load_genome(genome_path)
# (the output GTF file path is set in the create_gtf call further down)

# Build the PredictionGTF object that is used for model loading, prediction
# and GTF output in the following cells.
pred_gtf = PredictionGTF(
    # model and batching
    model_path=model_path,
    seq_len=seq_len,
    batch_size=batch_size,
    # input genome
    genome=genome,
    softmask=True,
    strand=strand,
    # HMM options
    hmm=True,
    num_hmm=1,
    hmm_factor=1,
    parallel_factor=hmm_parallel,
    # no temporary directory is configured
    temp_dir=None,
)

In [None]:
# load model
pred_gtf.load_model()

# load genome data (the chunk length is set to the model's seq_len)
genome_fasta = pred_gtf.init_fasta(min_seq_len=500, chunk_len=seq_len)
genome_data = pred_gtf.load_genome_data(
    genome_fasta,
    [],
    softmask=True,
    strand=strand,
)
x_data, coords, adapted_seqlen = genome_data

In [None]:
# generate LSTM and HMM predictions
hmm_pred = pred_gtf.get_predictions(x_data, hmm_filter=True)

# infer gene structures and write GTF file
# os.path.join avoids the doubled path separator that plain string
# concatenation produces when out_dir already ends with '/'
gtf_path = os.path.join(out_dir, "tiberius.gtf")
anno, tx_id = pred_gtf.create_gtf(
    y_label=hmm_pred,
    coords=coords,
    out_file=gtf_path,
    f_chunks=x_data,
    strand=strand,
)