In [12]:
# Lookup table: one-letter amino acid code -> list of DNA codons encoding it.
# The stop codons (TAA, TAG, TGA) appear in no entry.
amino_to_codon = {
    amino: codons.split()
    for amino, codons in (
        ('A', 'GCT GCC GCA GCG'),
        ('C', 'TGT TGC'),
        ('D', 'GAT GAC'),
        ('E', 'GAA GAG'),
        ('F', 'TTT TTC'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('K', 'AAA AAG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('M', 'ATG'),
        ('N', 'AAT AAC'),
        ('P', 'CCT CCC CCA CCG'),
        ('Q', 'CAA CAG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('S', 'TCT TCC TCA TCG AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('V', 'GTT GTC GTA GTG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
    )
}


def peptide_to_dna(protein: str) -> list:
    """Return every DNA string that encodes ``protein``.

    The result is built one amino acid at a time: each DNA prefix collected
    so far is extended with each codon listed for the next amino acid in
    ``amino_to_codon``. An empty ``protein`` yields ``['']``; an amino acid
    missing from the table raises ``KeyError``.
    """
    partial_strands = ['']  # a single empty prefix seeds the expansion
    for residue in protein:
        codons = amino_to_codon[residue]
        partial_strands = [
            prefix + codon
            for prefix in partial_strands
            for codon in codons
        ]
    return partial_strands

# Base-pairing table: A<->T and G<->C. Characters outside 'ATGC' are left as-is.
tab_input = 'ATGC'
tab_output = 'TACG'
trans_tab = str.maketrans(tab_input, tab_output)


def reverse_complement(pattern):
    """Return the reverse complement of the DNA string ``pattern``."""
    # Complementing is per-character, so it commutes with the reversal.
    complemented = pattern.translate(trans_tab)
    return complemented[::-1]

def find_encoding_subtrings(dna: str, peptide: str) -> str:
    """Find every substring of ``dna`` that encodes ``peptide`` on either strand.

    A window of ``3 * len(peptide)`` characters is reported when the window
    itself, or its reverse complement, is one of the strings produced by
    ``peptide_to_dna(peptide)``.

    :param dna: DNA text to scan.
    :param peptide: amino-acid string (one-letter codes) to search for.
    :return: the matching windows, in order of their start position, joined
        by newlines; an empty string when no window matches.
    """
    window = 3 * len(peptide)

    # A set gives constant-time membership tests for each window.
    encodings = set(peptide_to_dna(peptide))

    matches = []
    for start in range(len(dna) - window + 1):
        candidate = dna[start:start + window]
        # `or` short-circuits, so the reverse complement is only computed
        # when the forward strand does not already match.
        if candidate in encodings or reverse_complement(candidate) in encodings:
            matches.append(candidate)

    return '\n'.join(matches)


def main():
    """Read the input from ``rosalind_ba4b.txt`` and print the matching substrings.

    The first line of the file is used as the DNA text and the second line as
    the peptide; surrounding whitespace is stripped from both.
    """
    # The context manager closes the file even if a line is missing and
    # next() raises StopIteration.
    with open('rosalind_ba4b.txt', 'r') as file:
        dna = next(file).strip()
        peptide = next(file).strip()
    print(find_encoding_subtrings(dna, peptide))

In [13]:
# Entry point: run the solver only when executed directly, not on import.
if __name__ == '__main__':
    main()

GCCGTAACCATCCACACACCACCT
CGTTGGTGCGTGGATGGGTATGGC
AGGTGGTGTGTAGACGGCTATGGG
CGGTGGTGTGTCGACGGTTATGGG
AGATGGTGTGTGGATGGATACGGG
GCCATAACCATCTACGCACCAGCG
ACCGTACCCGTCCACGCACCATCG
ACCGTATCCATCAACGCACCAGCG
ACCGTATCCATCAACACACCACCT
CGGTGGTGTGTGGACGGATATGGG
CGCTGGTGTGTTGACGGTTACGGT
CGTTGGTGTGTAGATGGCTATGGC
CGATGGTGCGTGGATGGGTACGGC
AGATGGTGTGTAGATGGGTACGGA
TCCATACCCGTCTACACACCACCT
CGTTGGTGCGTGGACGGCTATGGA
AGGTGGTGTGTAGATGGATACGGT
CGGTGGTGCGTCGACGGATACGGA
TCCGTATCCATCTACGCACCACCG


In [14]:
def main():
    """Read the input from ``rosalind_ba4b.txt`` and print each match on its own line.

    The first line of the file is used as the DNA text and the second line as
    the peptide; surrounding whitespace is stripped from both.
    """
    # The context manager closes the file even if a line is missing and
    # next() raises StopIteration.
    with open('rosalind_ba4b.txt', 'r') as file:
        dna = next(file).strip()
        peptide = next(file).strip()

    result = peptide_encoding_sequences(dna, peptide)
    for r in result:
        print(r)


# One-letter amino acid code -> list of DNA codons that encode it.
# The stop codons (TAA, TAG, TGA) appear in no entry.
AMINO_TABLE = {
    'A': 'GCT GCC GCA GCG'.split(),
    'R': 'CGT CGC CGA CGG AGA AGG'.split(),
    'N': 'AAT AAC'.split(),
    'D': 'GAT GAC'.split(),
    'C': 'TGT TGC'.split(),
    'Q': 'CAA CAG'.split(),
    'E': 'GAA GAG'.split(),
    'G': 'GGT GGC GGA GGG'.split(),
    'H': 'CAT CAC'.split(),
    'I': 'ATT ATC ATA'.split(),
    'L': 'TTA TTG CTT CTC CTA CTG'.split(),
    'K': 'AAA AAG'.split(),
    'M': 'ATG'.split(),
    'F': 'TTT TTC'.split(),
    'P': 'CCT CCC CCA CCG'.split(),
    'S': 'TCT TCC TCA TCG AGT AGC'.split(),
    'T': 'ACT ACC ACA ACG'.split(),
    'W': 'TGG'.split(),
    'Y': 'TAT TAC'.split(),
    'V': 'GTT GTC GTA GTG'.split(),
}


def _amino_to_dna(amino: str) -> list:
    """
    Look up the DNA codons that encode a single amino acid
    :param amino: key into AMINO_TABLE (a KeyError is raised when it is absent)
    :return: the AMINO_TABLE entry for 'amino' (the stored list itself, not a copy)
    """
    codons = AMINO_TABLE[amino]
    return codons


RC_PIPELINE_A = ['A', 'a', 'T']
RC_PIPELINE_C = ['C', 'c', 'G']
RC_PIPELINE_T = ['T', 'A']
RC_PIPELINE_G = ['G', 'C']


def _reverse_complement(genome: str) -> str:
    """
    Get reverse complement of the given genome
    :param genome: sequence to transform
    :return: RC
    """
    return genome.\
        replace(RC_PIPELINE_A[0], RC_PIPELINE_A[1]).replace(RC_PIPELINE_C[0], RC_PIPELINE_C[1]).\
        replace(RC_PIPELINE_T[0], RC_PIPELINE_T[1]).replace(RC_PIPELINE_G[0], RC_PIPELINE_G[1]).\
        replace(RC_PIPELINE_A[1], RC_PIPELINE_A[2]).replace(RC_PIPELINE_C[1], RC_PIPELINE_C[2])\
        [::-1]


def _peptide_to_dna(protein: str) -> list:
    """
    Expand a peptide into the list of all DNA sequences that encode it
    :param protein: string of amino acids, each looked up via _amino_to_dna
    :return: every combination of per-amino-acid codons, concatenated in
             peptide order ([''] for an empty peptide)
    """
    strands = ['']
    for residue in protein:
        codons = _amino_to_dna(residue)
        strands = [prefix + codon for prefix in strands for codon in codons]
    return strands


def peptide_encoding_sequences(dna: str, peptide: str) -> list:
    """
    Collect the windows of 'dna' that encode 'peptide' on either strand
    :param dna: text to scan
    :param peptide: amino-acid string to look for
    :return: every window of length 3 * len(peptide), in order of position,
             that is itself, or whose reverse complement is, an encoding of
             'peptide'
    """
    encodings = set(_peptide_to_dna(peptide))
    window = 3 * len(peptide)

    def encodes(fragment):
        # Forward strand first; the reverse complement is computed only on a miss.
        return fragment in encodings or _reverse_complement(fragment) in encodings

    windows = (dna[start:start + window] for start in range(len(dna) - window + 1))
    return [fragment for fragment in windows if encodes(fragment)]


In [15]:
# Entry point: run the solver only when executed directly, not on import.
if __name__ == '__main__':
    main()

GCCGTAACCATCCACACACCACCT
CGTTGGTGCGTGGATGGGTATGGC
AGGTGGTGTGTAGACGGCTATGGG
CGGTGGTGTGTCGACGGTTATGGG
AGATGGTGTGTGGATGGATACGGG
GCCATAACCATCTACGCACCAGCG
ACCGTACCCGTCCACGCACCATCG
ACCGTATCCATCAACGCACCAGCG
ACCGTATCCATCAACACACCACCT
CGGTGGTGTGTGGACGGATATGGG
CGCTGGTGTGTTGACGGTTACGGT
CGTTGGTGTGTAGATGGCTATGGC
CGATGGTGCGTGGATGGGTACGGC
AGATGGTGTGTAGATGGGTACGGA
TCCATACCCGTCTACACACCACCT
CGTTGGTGCGTGGACGGCTATGGA
AGGTGGTGTGTAGATGGATACGGT
CGGTGGTGCGTCGACGGATACGGA
TCCGTATCCATCTACGCACCACCG


In [None]:
GCCGTAACCATCCACACACCACCT
CGTTGGTGCGTGGATGGGTATGGC
AGGTGGTGTGTAGACGGCTATGGG
CGGTGGTGTGTCGACGGTTATGGG
AGATGGTGTGTGGATGGATACGGG
GCCATAACCATCTACGCACCAGCG
ACCGTACCCGTCCACGCACCATCG
ACCGTATCCATCAACGCACCAGCG
ACCGTATCCATCAACACACCACCT
CGGTGGTGTGTGGACGGATATGGG
CGCTGGTGTGTTGACGGTTACGGT
CGTTGGTGTGTAGATGGCTATGGC
CGATGGTGCGTGGATGGGTACGGC
AGATGGTGTGTAGATGGGTACGGA
TCCATACCCGTCTACACACCACCT
CGTTGGTGCGTGGACGGCTATGGA
AGGTGGTGTGTAGATGGATACGGT
CGGTGGTGCGTCGACGGATACGGA
TCCGTATCCATCTACGCACCACCG