## Sequence Import Functions

In [1]:
from Bio import SeqIO

In [2]:
# Path of the FASTA assembly used to exercise the functions defined below.
test_genome = './test_data/GCA_029694365.1_PDT001647321.1_genomic.fna'
# NOTE(review): the two settings below are disabled and nothing in the
# visible code reads them.
#estimated_genome_size = 2.9
#genome_skew = 0.1

In [14]:
def read_fasta(fasta):
    """
    Load a FASTA file into a dictionary.

    :param fasta: path of the FASTA file to read
    :return: dict mapping each record's description to its sequence
        (the ``record.seq`` object yielded by ``SeqIO.parse``); if two
        records share a description, the later one replaces the earlier
    """
    with open(fasta, "r") as handle:
        return {
            entry.description: entry.seq
            for entry in SeqIO.parse(handle, "fasta")
        }

In [15]:
# Load every record of the test assembly, keyed by its record description.
test_fasta = read_fasta(test_genome)

In [19]:
def remove_guided_contigs(fasta_dict):
    """
    Drop guided contigs and shorten the headers of the remaining ones.

    :param fasta_dict: dict mapping FASTA header strings to sequences
    :return: new dict containing only the entries whose header does not
        include the substring '.guided', keyed by the header text that
        precedes the first space; if two shortened keys collide, the
        later entry replaces the earlier one
    """
    return {
        header.split(' ')[0]: sequence
        for header, sequence in fasta_dict.items()
        if '.guided' not in header
    }

In [22]:
# Drop the '.guided' records and shorten the remaining keys to their first token.
filtered_contigs = remove_guided_contigs(test_fasta)

In [139]:
def concat_record(fasta_dict):
    """
    Join all sequences of a dictionary into one string.

    :param fasta_dict: dict whose values are sequences
    :return: the ``str()`` of every value, in dict order, concatenated
        with no separator
    """
    pieces = map(str, fasta_dict.values())
    return ''.join(pieces)

In [24]:
def genome_import_process(fasta):
    """
    Import a FASTA file and filter out its guided contigs.

    :param fasta: path of the FASTA file to import
    :return: tuple of (fasta, filtered_contigs): the input path exactly
        as given, and the dict obtained by passing the output of
        read_fasta through remove_guided_contigs
    """
    contigs = remove_guided_contigs(read_fasta(fasta))
    return fasta, contigs
    

In [25]:
# Run the import pipeline on the test genome; the result is a
# (file path, filtered contig dict) tuple.
passed_fasta = genome_import_process(test_genome)

## Nucleotide Frequencies

In [26]:
from itertools import product
import pandas as pd
import os

In [27]:
# Folder handed to create_db as the destination for the generated CSV file.
output_folder = './Output/'

In [28]:
def space_seperated_record(fasta_dict):
    """
    Join all sequences of a dictionary into one space-delimited string.

    :param fasta_dict: dict whose values are sequences
    :return: the ``str()`` of every value, in dict order, separated by a
        single space
    """
    pieces = map(str, fasta_dict.values())
    return ' '.join(pieces)

In [29]:
def count_nucleotides(input_seq):
    """
    Count the A, C, G and T characters of a sequence, ignoring case.

    :param input_seq: sequence to inspect; it is upper-cased before counting
    :return: dict with the keys 'A', 'C', 'G', 'T' (in that order) mapped
        to the number of times each occurs; other characters are ignored
    """
    upper_seq = input_seq.upper()
    return {base: upper_seq.count(base) for base in ('A', 'C', 'G', 'T')}

In [30]:
def generate_combinations(nucleotides, X):
    """
    Build every string of length X over the given symbols.

    :param nucleotides: iterable of string symbols to combine
    :param X: length of each generated string
    :return: list of the joined tuples of
        ``itertools.product(nucleotides, repeat=X)``, in product order
    """
    tuples = product(nucleotides, repeat=X)
    return list(map(''.join, tuples))

In [31]:
def count_dinucleotides(input_seq):
    """
    Count every occurrence of the 16 A/C/G/T dinucleotides in a sequence.

    A window of two characters is examined at every position, so
    overlapping occurrences are all counted ("AAA" contains "AA" twice).
    ``str.count`` reports only non-overlapping matches and would
    under-count homopolymer runs.

    :param input_seq: nucleotide sequence; it is upper-cased before counting
    :return: dict mapping each dinucleotide ('AA', 'AC', ..., 'TT') to its
        count; windows containing any other character (for example the
        space that separates joined contigs, or 'N') are not counted
    """
    sequence = str(input_seq.upper())

    # Seed every dinucleotide with 0 so absent ones still appear in the result.
    dinucleotide_counts = {
        ''.join(pair): 0 for pair in product('ACGT', repeat=2)
    }

    for start in range(len(sequence) - 1):
        window = sequence[start:start + 2]
        if window in dinucleotide_counts:
            dinucleotide_counts[window] += 1

    return dinucleotide_counts

In [32]:
def count_trinucleotides(input_seq):
    """
    Count every occurrence of the 64 A/C/G/T trinucleotides in a sequence.

    A window of three characters is examined at every position, so
    overlapping occurrences are all counted ("AAAA" contains "AAA" twice,
    "ATATA" contains "ATA" twice). ``str.count`` reports only
    non-overlapping matches and would under-count such repeats.

    :param input_seq: nucleotide sequence; it is upper-cased before counting
    :return: dict mapping each trinucleotide ('AAA', 'AAC', ..., 'TTT') to
        its count; windows containing any other character (for example the
        space that separates joined contigs, or 'N') are not counted
    """
    sequence = str(input_seq.upper())

    # Seed every trinucleotide with 0 so absent ones still appear in the result.
    trinucleotide_counts = {
        ''.join(triplet): 0 for triplet in product('ACGT', repeat=3)
    }

    for start in range(len(sequence) - 2):
        window = sequence[start:start + 3]
        if window in trinucleotide_counts:
            trinucleotide_counts[window] += 1

    return trinucleotide_counts

In [33]:
def merge_dicts(dict1, dict2, dict3, dict4):
    """
    Combine four dictionaries into a new one.

    :param dict1: first dictionary
    :param dict2: second dictionary
    :param dict3: third dictionary
    :param dict4: fourth dictionary
    :return: new dict holding the entries of all four inputs; when a key
        appears more than once, the value from the later argument wins.
        The inputs are not modified.
    """
    combined = {}
    for part in (dict1, dict2, dict3, dict4):
        combined.update(part)
    return combined

In [34]:
def create_dir(dir_path):
    """
    Make sure a directory exists, reporting the outcome on stdout.

    :param dir_path: path of the directory to create (missing parent
        directories are created as well)
    :return: None; a failure is printed rather than raised
    """
    try:
        # exist_ok=True makes an already-existing directory a non-error.
        os.makedirs(dir_path, exist_ok=True)
        message = f"Directory '{dir_path}' is created or already exists."
    except Exception as err:
        message = f"Error creating directory: {err}"
    print(message)

In [35]:
def create_db(import_sequnce, output_folder):
    """
    Build a one-row table of nucleotide counts for a genome and save it as CSV.

    :param import_sequnce: indexable whose item 0 is the input FASTA path
        and whose item 1 is a dict of sequences (the shape returned by
        genome_import_process)
    :param output_folder: directory that receives the CSV; it is created
        through create_dir before writing
    :return: pandas DataFrame with a single row (index 0) holding a 'file'
        column followed by the counts from count_nucleotides,
        count_dinucleotides and count_trinucleotides

    The CSV is written to ``<output_folder>/<input base name without its
    last extension>.nuc_freq.csv`` without the index column.
    """
    source_path = import_sequnce[0]
    contigs = import_sequnce[1]

    # Sequences are joined with spaces before being handed to the counters.
    joined = space_seperated_record(contigs)

    row = merge_dicts(
        {'file': source_path},
        count_nucleotides(joined),
        count_dinucleotides(joined),
        count_trinucleotides(joined),
    )
    nuc_freq_df = pd.DataFrame(row, index=[0])

    stem = os.path.splitext(os.path.basename(source_path))[0]
    output_path = os.path.join(output_folder, stem + '.nuc_freq.csv')

    create_dir(output_folder)
    nuc_freq_df.to_csv(output_path, index=False)

    return nuc_freq_df

In [36]:
# Build the count table for the test genome and write it as CSV under output_folder.
df = create_db(passed_fasta,output_folder)

Directory './Output/' is created or already exists.
