In [None]:
'''
GeneCNN, a convolutional neural network-based gene predictor
Developed by Michael Morikone
Requires a genome file and transcriptomic data
Tested with the masked flattened buffalograss genome and 6 transcriptomic datasets from 4 different BioProjects

Predictions were run using the first pseudochromosome of the buffalograss genome
'''

In [None]:
import numpy as np
import tensorflow as tf
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import random
import keras_tuner as kt
from tensorflow import keras
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
np.random.seed(25)
import statistics

In [None]:
#only the first pseudochromosome of buffalograss is used for comparisons
chr1 = SeqIO.parse("chr1_hard_model_mask.fasta", "fasta")

#pull the sequence string and identifier out of every record in the FASTA file
totalseqs = []
seqs_ids = []
for record in chr1:
    seqs_ids.append(record.id)
    totalseqs.append(str(record.seq))

#flatten all records, in file order, into a single list with one character per base
chromosome = []
for record_sequence in totalseqs:
    chromosome.extend(record_sequence)

#keep only the leading stretch of the flattened sequence: the first 9,983,999 bases (~9.984 million)
tiny_chromosome = chromosome[:9983999]

In [None]:
#optional export: writes tiny_chromosome to "test_10m_chunk.fasta" as a single record with id "First10mChunk"
#the code is disabled by being wrapped in a string literal; remove the surrounding triple quotes to run it

'''test_seq10m = Seq(''.join(map(str,tiny_chromosome)))
test_10m_chunk = SeqRecord(test_seq10m)
test_10m_chunk.id = "First10mChunk"
with open("test_10m_chunk.fasta", "w") as out:
    SeqIO.write(test_10m_chunk, out, "fasta")'''

In [None]:
#sliding window function
def window_split(elements, window_size):
    """Lazily yield every contiguous window of `window_size` items from `elements`.

    Windows advance one position at a time, so an input of length n produces
    n - window_size + 1 windows. This is a generator because the full set of
    windows is too large to materialise at once.

    Parameters
    ----------
    elements : sequence
        Anything supporting len() and slicing (str, list, ...).
    window_size : int
        Number of items per window.

    Yields
    ------
    Slices of `elements`, each of length `window_size`. Nothing is yielded
    when `elements` is shorter than `window_size` (no padding is applied).
    """
    # Strictly-shorter inputs cannot fill a window; an input of exactly
    # window_size items still yields its single full window.
    if len(elements) < window_size:
        return
    for start in range(len(elements) - window_size + 1):
        yield elements[start:start + window_size]

In [None]:
# data preparation for one hot encoding, conversion of alphabetic representation of nucleotides to numeric representation

#convert all of input sequence into 0-3 to prep for one hot
def ordinal(sequence):
    """Translate a nucleotide sequence into integer codes for one-hot encoding.

    'A' maps to 0, 'C' to 1 and 'G' to 2. Every other symbol (which includes
    'T', 'N' and any lowercase letter) maps to 3.

    Returns a list of ints with one entry per element of `sequence`.
    """
    base_codes = {'A': 0, 'C': 1, 'G': 2}
    fallback_code = 3  # shared by everything that is not an uppercase A, C or G
    return [base_codes.get(symbol, fallback_code) for symbol in sequence]

In [None]:
#one-hot encoding function

def one_hot_encode(ordinal_sequence):
    """One-hot encode integer nucleotide codes using four categories.

    Thin wrapper around tf.one_hot with a depth of 4, matching the four codes
    (0-3) produced by `ordinal`. Returns whatever tf.one_hot returns for the
    given input.
    """
    return tf.one_hot(ordinal_sequence, depth=4)

In [None]:
#load the saved model stored under the path "Bayesian2" (saved from CNN.ipynb)
#`model` is used below, via model.predict, to score the one-hot encoded sliding windows
model = tf.keras.models.load_model("Bayesian2")

In [None]:
#predict sliding windows using saved model, generates output predictions across entire tiny_chromosome
#windows are encoded and scored chunk by chunk so only one chunk of one-hot tensors is held at a time

window_size = 500
chunk_size = 16000 #16k for optimal batch sizing


def _predict_slide(slide):
    """Score one chunk of one-hot encoded windows; returns one list of model outputs per window."""
    predictions = model.predict(tf.convert_to_tensor(slide))
    return [list(prediction) for prediction in predictions]


window = window_split(tiny_chromosome, window_size)
results = [] #results[i] holds the model output for the window starting at base i
current_slide = []
for window_bases in window:
    current_slide.append(one_hot_encode(ordinal(window_bases)))
    if len(current_slide) == chunk_size:
        results.extend(_predict_slide(current_slide))
        current_slide = []

#flush the final chunk: it is usually only partially filled and would otherwise never be predicted,
#leaving the tail of tiny_chromosome without any results
if current_slide:
    results.extend(_predict_slide(current_slide))
    current_slide = []

In [None]:
#sliding window approach used to generate matrix that has historical predictions for every nucleotide position
#results[i] is the prediction for the window covering bases i .. i + matrix_window_size - 1,
#so each window casts one vote (0 or 1) on every base it covers

matrix_window_size = 500

#the windows jointly cover len(results) + matrix_window_size - 1 bases; nothing is covered without results
covered_bases = len(results) + matrix_window_size - 1 if len(results) > 0 else 0
total_list = [[] for _ in range(covered_bases)] #total_list[p] collects every vote cast on base p

for i in range(len(results)): #every window votes, including the last ones
    #vote 0 when the first model output is larger than the second, otherwise vote 1
    vote = 0 if results[i][0] > results[i][1] else 1
    for j in range(matrix_window_size):
        total_list[i + j].append(vote)

In [None]:
#collapse the votes gathered for each nucleotide position into a single 0/1 call:
#a position becomes 1 when the mean of its votes is at least 0.5, otherwise 0

averaged_base = [
    0 if statistics.fmean(position_votes) < 0.5 else 1
    for position_votes in total_list
]

In [None]:
#create contiguous sequences out of all genic predicted nucleotides
#each entry of all_contigs is [start_location, end_location, length], with end_location inclusive

all_contigs = []
run_start = None #index where the current run of 1s began, None while outside a run
for position, call in enumerate(averaged_base):
    if call == 1:
        if run_start is None:
            run_start = position
    elif run_start is not None:
        #a 0 right after a run of 1s closes that run at the previous position
        all_contigs.append([run_start, position - 1, position - run_start])
        run_start = None

#a run that reaches the very last position has no trailing 0 to close it, so close it here
if run_start is not None:
    all_contigs.append([run_start, len(averaged_base) - 1, len(averaged_base) - run_start])

In [None]:
#create fasta file where all predictions must be 200 bp length or larger
#fasta file has predictions in sequence field and prediction start site in header field

min_prediction_length = 200

fasta_dict = {} #start position (as a string) -> predicted sequence
for contig_start, contig_end, contig_length in all_contigs:
    if contig_length >= min_prediction_length:
        #contig_end is inclusive, so the slice has to run one position past it
        fasta_dict[str(contig_start)] = ''.join(map(str, tiny_chromosome[contig_start:contig_end + 1]))

#NOTE(review): the file name says "250bp" while the length filter above is 200 bp - confirm which is intended
output_path = 'chr1_hard_model_mask_test_first_10m_250bp_gene_predictions.fasta'
#the with-block closes the file even if one of the writes fails
with open(output_path, 'w') as output_file:
    for header, fasta_seq in fasta_dict.items():
        output_file.write(">" + header + "\n")
        output_file.write(fasta_seq + "\n")