In [1]:
from Bio import SeqIO

## General Functions

In [2]:
# Path to the FASTA file used as test input for the functions below.
test_genome = '../feature_gen_test/Test_Data/Reference/GCF_009734005.1_ASM973400v2_genomic.fna'
# Expected genome size in MB; check_genome_size multiplies it by 1,000,000 to get base pairs.
estimated_genome_size = 2.9
# Allowed deviation from the expected size, as a fraction of it (0.1 -> +/-10%).
genome_skew = 0.1

In [3]:
def read_fasta(fasta):
    """
    Load every record of a fasta file into a dictionary.

    :param fasta: path to the fasta file
    :return: dict mapping each record id to that record's sequence; if an id
        occurs more than once, the last record with that id is the one kept
    """
    with open(fasta, "r") as handle:
        return {entry.id: entry.seq for entry in SeqIO.parse(handle, "fasta")}

In [4]:
# Parse the test genome into a {record id: sequence} dictionary.
test_fasta = read_fasta(test_genome)

In [5]:
# Display the parsed records to confirm the import worked.
test_fasta

{'NZ_CP038996.1': Seq('ATGGTATCCCTCGATGCTTTATGGAATGAATTAAAAGCAACATACCAAAAGGAT...TAT'),
 'NZ_CP038997.1': Seq('TATCACTATTTAACAATTCTTAACGTTTTCTTAATGTTTATAATCACGTATATA...ACA')}

In [6]:
def get_longest_record(fasta_dict):
    """
    Pick the longest sequence out of a dictionary of sequences.

    :param fasta_dict: dict mapping sequence id to sequence
    :return: tuple of (sequence id, sequence) for the longest sequence; when
        several share the maximum length, the first one in dict order wins
    """
    # max() keeps the first maximal item it encounters, so iterating items()
    # selects the same record as iterating the keys would.
    record_id, record_seq = max(fasta_dict.items(), key=lambda item: len(item[1]))
    return (record_id, record_seq)

In [7]:
# Despite the name, this holds an (id, sequence) tuple, not just the sequence.
longest_seq = get_longest_record(test_fasta)

In [14]:
def check_genome_size(sequence, genome_size, skew):
    """
    Check whether a sequence's length lies within a tolerance of an expected genome size.

    :param sequence: (sequence id, sequence) pair; only the length of the
        second element is inspected
    :param genome_size: expected genome size in MB (float), e.g. 2.9
    :param skew: allowed deviation as a fraction of genome_size (float),
        e.g. 0.1 accepts lengths within +/-10% of the expected size
    :return: True if minsize <= len(sequence[1]) <= maxsize (both bounds
        inclusive), otherwise False
    """
    bp_genome_size = genome_size * 1000000
    minsize = bp_genome_size - (bp_genome_size * skew)
    maxsize = bp_genome_size + (bp_genome_size * skew)

    # A chained comparison replaces the bitwise `&` the original used as a
    # logical operator, and its boolean result is returned directly.
    return minsize <= len(sequence[1]) <= maxsize

In [15]:
# Sanity check: is the longest test record within the allowed size range?
print(check_genome_size(longest_seq, estimated_genome_size, genome_skew))

True


In [16]:
def genome_import_process(fasta, genome_size, skew):
    """
    Import a fasta file, select its longest record, and check that record's size.

    :param fasta: path to the fasta file
    :param genome_size: estimated genome size for the species, in MB
    :param skew: allowed deviation from genome_size, as a fraction (0.1 = +/-10%)
    :return: tuple of (fasta_file_name, sequence id, sequence) when the longest
        record passes the size check, otherwise None
    """
    fasta_dict = read_fasta(fasta)

    longest_record = get_longest_record(fasta_dict)

    # Check against the caller's arguments. The previous version read the
    # notebook-level globals `estimated_genome_size` / `genome_skew`, which
    # silently ignored these parameters.
    if check_genome_size(longest_record, genome_size, skew):
        print('Input sequence ' + fasta + ' passed checks')
        return fasta, longest_record[0], longest_record[1]

    print('Input sequence ' + fasta + ' did not pass checks')
    return None

In [17]:
# Run the full import-and-check pipeline on the test genome.
genome_import_process(test_genome, estimated_genome_size, genome_skew)

Input sequence ../feature_gen_test/Test_Data/Reference/GCF_009734005.1_ASM973400v2_genomic.fna passed checks


('../feature_gen_test/Test_Data/Reference/GCF_009734005.1_ASM973400v2_genomic.fna',
 'NZ_CP038996.1',
 Seq('ATGGTATCCCTCGATGCTTTATGGAATGAATTAAAAGCAACATACCAAAAGGAT...TAT'))