## Protein Translation Problem

In [None]:
# Load the codon reference file: every line is stripped and split on single
# spaces into its fields.
with open('data/RNA_codon_table_1.txt', 'r') as reference_file:
    codons = [row.strip().split(' ') for row in reference_file]

In [None]:
# Lookup table: first field of each row (the codon) -> second field (the
# amino acid). Rows that only have one field map to the empty string.
CODON_TABLE = dict(
    (fields[0], fields[1] if len(fields) > 1 else '')
    for fields in codons
)

In [None]:
# Display how many distinct codons were loaded into the table.
len(CODON_TABLE)

In [None]:
# Sample RNA string and the peptide its translation is checked against.
sample_input = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'
sample_output = 'MAMAPRTEINSTRING'

In [None]:
def protein_translation(rna, codon_table=None):
    """Translate an RNA string into an amino-acid string.

    The string is read as consecutive, non-overlapping triplets starting at
    index 0. Trailing bases that do not fill a whole codon are ignored.
    Codons that the table maps to the empty string (as ``CODON_TABLE`` does
    for rows without an amino acid) contribute nothing to the output;
    translation does not halt at them.

    Args:
        rna: RNA sequence to translate.
        codon_table: optional mapping ``codon -> amino acid``. When omitted,
            the module-level ``CODON_TABLE`` is used.

    Returns:
        The concatenation of the amino acids of every complete codon.

    Raises:
        KeyError: if a complete codon is missing from the table.
    """
    if codon_table is None:
        codon_table = CODON_TABLE
    # Start indices of complete codons: 0, 3, 6, ... while i + 3 <= len(rna).
    codon_starts = range(0, len(rna) - 2, 3)
    return ''.join(codon_table[rna[i:i + 3]] for i in codon_starts)

In [None]:
# Check the translation of the sample RNA against the expected peptide.
assert sample_output.strip() == protein_translation(sample_input)

In [None]:
# open('data/dataset_96_4.txt', 'r')

---
## Peptide Encoding Problem

In [None]:
# Complement of each RNA base; reverse() looks bases up here.
reverse_bases = {'A':'U', 'U':'A', 'G':'C', 'C':'G'}

In [None]:
# Sample for the Peptide Encoding Problem: a DNA string, a peptide, and the
# DNA substrings that encode_peptide() is expected to return (duplicates
# included, one per matching position).
sample_dna = 'ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'
sample_peptide = 'MA'
sample_output = [
    'ATGGCC',
    'GGCCAT',
    'ATGGCC',
]

In [None]:
def dna_to_rna(dna):
    """Transcribe a DNA string to RNA by turning every 'T' into 'U'.

    All other characters are copied through unchanged.
    """
    transcription = {'T': 'U'}
    return ''.join(transcription.get(base, base) for base in dna)

In [None]:
def reverse(rna):
    """Return the reverse complement of an RNA string.

    The string is walked from its last base to its first and every base is
    replaced by its partner from the module-level ``reverse_bases`` map.
    A base missing from that map raises KeyError.
    """
    complemented = [reverse_bases[base] for base in reversed(rna)]
    return ''.join(complemented)

In [None]:
def encode_peptide(peptide, dna):
    """Find the substrings of ``dna`` that encode ``peptide``.

    A substring of length ``3 * len(peptide)`` encodes the peptide when its
    RNA transcript, or the reverse complement of that transcript, translates
    to ``peptide``.

    Args:
        peptide: amino-acid string to look for.
        dna: DNA string to scan.

    Returns:
        The matching substrings in order of their start position. A substring
        that occurs at several positions is returned once per position.
    """
    k = len(peptide) * 3
    results = []
    # Slice one window at a time instead of materializing every k-mer up
    # front: on genome-sized input the full list costs about k times the
    # memory of the genome itself.
    for start in range(len(dna) - k + 1):
        kmer = dna[start:start + k]
        rna = dna_to_rna(kmer)
        if (
            protein_translation(rna) == peptide
            or protein_translation(reverse(rna)) == peptide
        ):
            results.append(kmer)
    return results

In [None]:
# Compare as sorted lists: only the multiset of substrings matters, not order.
assert sorted(sample_output) == sorted(encode_peptide(sample_peptide, sample_dna))

In [None]:
test_file = 'data/dataset_96_7.txt'
test_dna = 'AACAAAGTCGCGGAAACTCAATGGCTGCTTACGGATCCCAGAAACTGGATCCTTTGTATTAAGCTGGAACTTTGCATAGACCGCTATTTCGTCCATCGGGGATTGCATTCTGAGGTACCGAACCTCCAATGGCAAGAAGCATATCGGCGTCGCCTTTTATCCACCGTACTGCCCTACCGCTGATGGTAGCAAGGTAGGCTGTGCTGGCGTTACATTCGCGATATTAAGTAGATATATGAAACAACGCTTACTAGCAGCTGTATTTATCGCTTATCTCGCGGACTCAAATAGTAAGTCGCTCCAGCTCCTAGGGGGGACGATACAAGGATCTAAAAACGTCCGTTGTTCCTTAACGTGTTACCTGCCGCTCTCACCATGATCTGAGTCACTCCGGCTATCGACTGGAGGCCCCTGCACTCCGAAGTACCTAACTAGGTTTGGCACCTCCGAGTGCAAGCGGTATGGATAATGTTCATAAGACCCGACACGATCCCGAGACCTAACGAGTATCGGTATCGCCTGTACACCTTTGACTAGGTTATAAGGGACCATGCGCTCTCGAACGTATTCCAGGATTATAGGATCTGGTACTCCTGTATGAAATACTCTTAGGCCTGGCTACTTCTGGGACGGGAGGTGAGCTTACCGACGGGAATGGGAAACCCCCTCAAACTACAAGGCGTTAGTCCCCACGGAAGGCAAATGGAGCACAGGGATGCCCTCCGATCTCACATGCCTACAGGCCTTGGTTACGCTGCATCTGGTATCTTATCCTGTTACAGCGACCCCGGGGTCATCAAGGGTTCGTCGCGTCGTAGTCTTGATAGATAGGTGCAAGCGCGGTAATTGTGGGAAGAATTCGCCTTGGTGCGGAGATCCGCCGGGTCCTCCCGGGAGAGGTAATCCGGGCCTGTCCCGACGCGTTGTGGACGTCCAGGCAATGTTTTTCACGCTTATTGCGCAACCACCCCTAAGGCTCGGGCTTTACTTCTTAGGCGAGGAAGTTGGAACCACCGCAAGTTATTTTTTGCAGCACAACGAGGAATAAATACTGGCGGGTTCCTACACCACTATGTGGCCCTCGCAACTTACCAAATGAATACTTGCTGGCGTAGATTTCTCTGAGCTGGGTTCTAACCAAGTCTGTGGGATCTAGACGTAAGACAGACATGACCCTTATTGAGGAAGAAGTTCCTGTCTTCTGATTGTTGACGCACTTTTACCAAAGGCGCCGCCAGAACTGTCCCATAAATCAGCTCTTGCATGCCCAGTCAGCCATCTGCTTAAAGCCGAAGTCCCAATCACTCAAGCTTAACGAGATCCGCCCCGCTTCATTGAAGCTATGATGCTTGCATATCCCGTCTGGCAGTTCTTGCCATCTCCATGTTGTGGTGCAGGTACTACCATTCACAACAATGTTACGGCGCCATGGCGGGCCTATACTGGCCAGTATCACACCTTCATTCAGAAGTTCCAAACCTGTTCATATGCTAGCAGCGCGGAGTGAGCTATCGGGATTAATCAACCTACGCTTGACCGTTTGCAAGTCGATGCCCGTCGGTGGTAACCAAAGTTGCGAGCTCGTATGCTACCAATATCAATCAGTCCTTGCACGTGGATTCGGCATCACTAAGTTGGGCACCTCCGAATGAAGCGAGTACAGCTCTTCTAAACGGGGACACTGAGGACGATGTCAAGGGCGTTACCGACGCTACACACTGTCCGTACTCTCACCTCAATTTCCTTTATCCTATGTGCTGTAGACGTTATGTTGGCGATACTACCGATGCTCACCTTTCCGGTGCCGGCGATAGCGTACTCATTTCATTTATAGCAAGAACTAACGATCCGCCACTGATGCGTAATTGGTGATCGATTGTCAATTTGCCTAAGTTCAACACCCTGCCTACGCCGCCCGTGAGATAACCAGGAGTTCTCTCCCCGGCTTGATCATTCGAATTGCATAACGTGTTGCCGTGTTCACGATG
ATTCTGGACTAAATTAGGTACCTCGCTGTGTAACCCTCCCGCAACCAATTTCGGGATCGTAGCGGTACGACGAGTCGGTGGCTTCAAAGGCTATCCGGAATGGTGTGAACCTAGGTCGCTACTTCCCCCATACTCTCCAGTTGACTTAATCAGTGTTGCGTACATAGTACGCGCCAAGTCTTTTTCAGGAAGCGGAGGGAATTAACCATGCGTACCGCACTATCCGGATATTGGGAGGATCCAGCGCATGAAGTAAACTGAGACTCCTGTACTAGAGGAATGCCCAGCTGAGGGTTTTGACCATCAGCGCAGACGGACAGCGGACTTCGTAACACTTGATTGTTAGGCACCAGTGAGGAATGGATATCTGCCGCACTAGTCTCTCGCACACAGTCCTGTCCGTTGCGGCCACTTCTATCGTCTTCTATAGTATCTGCATTCGGAAGTCCCAAACTTAAGAAACTTCGAGTTCATTGGTAAGTGGAGATACCCCCACGCCGGGCACTACTATTATTAACCGTCGAACAAGTTAGTCTACTCACGTTTTAGCGATCAGACTCGTCGTGAAAATCTACCATGAGTAGATTGGATTCATACAGAATACAAGTCCTCTGCAACCGGCGGCCCTTTATGGCCTGTATTGCAGCTTCACTCTGAGGTGCCTAACTTGAGACTGTTTAAAGGCGCGTCCTTCCACGGCACGGGAGGAGAGGCGTCTTGACGCCCCGAGTGCGCCGGGTATGAGGAGCCGGACGAGAATTGGCGGCTCAGACTCGAAATTGATTCTTCGCCGTGTCTCTTTTCTGAAGTAAACATATCCCTGCTAGAAATGCCAATGTTGGGGTAGGACAGCTGGCTGCATCACTGTATTTGCATTCCGAAGTGCCCAACTTGATGGGTGCAGAGTTTGCGATGAGGACTCACCAGGGCGGAATTCCAGTCGGCGACGAGCTTACATAGCGAGGTGCCGAATCTGGCATCATTAATGTTAGCGTCGGCTTGGTGTCAACTGACATACGAGTTCCGCGAGACGGAAGATCGGGAGGTTCAGGCGGAGCATTTCATCCAGGGAAAAAACTTTGCACCCCGACACTGAAAATCAAAGTAACTGCTTACTTTCTTTCCGCTTGATGGCATCCCACTTATTAATCTGTCCTAATGGGGTCTCATAGAGCCCGACACGAACTTAATCCATGTGTTAGACGCTCGGACTAGGCAGGATAGCGATTCGTCGTCATATGCGTGTGCTAAGTCTTTCAGATTCGGCACCTCGGAATGCAGGACCATAAAGCGGACAGGCACGGGCGGTGGGTTATTCATGTGCTAGAAGGGACGTGGTCGACGGCGGATGTGTAACTTGCCGTGTTCAGTGTACAAAGTCTAGTTAGACCCAACATGGATCTCACCTTGACCAGCTATTCACGGGAATATTAGCGATTCGTACTCGTTACACCGCCTTTCTAAATAGCCCGAGGCACTAATGCCCAGAGGTAACCTGTTAATTGGGGGCCAGCAATGAGCGTAGAATCTAATCGGTACTCTCTCCGTTTCGGCCAGATTTCGAGGGTCTCGGGGTTTAGAGTCTGTATCACAGTCTATCGGCATGAACTGAGGGCCCGTGGTGGCGGAACCATTATGGCGTACGCCTATAATCCAAAAAAAAACGCTCTTTCTCTCCGGAATTTCGTAATATCGGGATGTCCATGGGCACGTTTATGGCCGTTTGTTGGTATGGTAGGTACTAGTGCGCGAGCGTCCCCGTTCGAAATCCCACATTCTAGGGCACGCTTCCGCAGCTAAGAGATCCCTGTGCAGTGCCAGCTCAGCTAACTAAGTGCTATCCATCCGAGGCCCGCCTCTTATAGGGCATATAATCCAACTACCACCGTTTCTCCTTTCGACCCTTCAAGTTACGCTTTTGACACCGCATCGATAACGAACAGTGACTTTCGGTTACTCGCTGACGTTGTTTGACCATGCGCGTGGATGAATAAGTAGGC
CAATGGTCTCCCTAGATAGGACTCCATTCGGGTGGTCAGACGCTCGACCCCCCGTATGCGTCGGTACACAAGATGACTCAATTTTAGCCCACGGTTGTCGAAGTACGCAACGAGGGTTACAACTGTGTTGTTTATGATCAGTTATGGTCTGGGGTCCCTAATTAAGCGAATCTAGTTGGCGCACATAACTCGCAGATATGTACAAGCTGGCTTTATGTGTAGCTGTTTTATTAGGATCGGCCTTTCAATAGGTGCTTATCCTACGCTGACCTCAGGGTGGCTCCGTAACGGGCGGCCGCAGGGGAATTTCTGCCCATTTGTTACCACGTGTACCGGACCTTACTGCCTCACGGACCAGCAATCATAGACTTACACTTAGTCCACTTACGGGACGATCATCACCTGAAGAACTCGTTTCGTCACGCAGTCCGGGCCTGTGGCCTGTGATAAGAGACTGCGGAGAGTTGAGGATGGGCTAGCGCCTCATACTCTTGATGGGTGAGGCCAACGGGTTAGCAAGCCCATAGTGACTCGTTGCTGGTGCTGGTTACTTTAGCTAGCATTTTTCGAACAGTATTGCCGAAAGACTAGGAAAGGCGTTAACTAAAGCGGTACATGTTCCAAGAGTTCGTCACAATCCCCGTTGAACCGATCGAGCTTCAACAAACGGATCCAGCTTCCCATGTATGATAGGCGGACGAGGACGATGGCCGATGTCACGAACCCCGAACCGTGCCGCGTGACGGGACGTTGCCTTGCCGTGCGCGAGCAATATGACGTGTCGGCTGCCGTGTTTCATAAGGTAAGTGCCATTCCCGAGCACCGAAGTCGTTGGCGCTGACAATTATTCAGAGATGATCGACGTTGCCATAAATTGACTTAAACAGGCACGGGGATAAGTACGAGGTGAAGTATTATAACTATCTGATGCCCGTTCTAACAGGGTAAAGCCGGCCGTATAAATACGGGAACACTACCATTATCGGATTCCACATGCTCTCATTTCGGCCAATAGCGCTCTCGAGGCGATCCTTTCGTTAAAGTATGTGTCGCTGATCACCGCCCGACCTTTTAAAGCCCCCTGACCTGAGAAAAGACGACTTAATAGGAAATTTCGCGTCTAGTGCCCTCCATCGCAGTGTCCGGCTAAACTCGGTTACATCACCGGTCGGAGAATACAGCATCGACGCACAACAGGCAGTACCCGGTCCTCGTGACACGATTCTTTTTGGTTAGAAAACGCAGCCATTTGGACTTCTATTCAGACGTCCATACACACACAATTTCCGCTTCAGCCGGGACACGCCGTACGCAGGGAAGGGCCACGCCTATCAATATAACGAATAGTAGGGACCTTCGTTTTTAATCGGCCTGAGCGGGTAACTACGCAGTAATGTCCGGTCTAAGACTAACTTCTCGTGCCCCGAATGGGCGTAAATAAAAAGGTGCGAATCGGGCGCACCTATTATTAACTGCTCATGCTAACACCCTTTCCGTAGTTTTACACCACGTCATATGACTGTTTTCGATGCGCAGACTATGAACGACTGTCTGCCTCCGGACTGATACGTCTCGCCAGCCATACAATATCGTAACGCTTGATTGGCAAAAGAGCTAAGACACTTATCGCCGCAGGTAGACGCACGCCGGATTACTTTTTTACACAGTGAAGTTCCAAATCTAGCAACTGAGACCTGTTACGACCGCAAGTCACGACGTGGGGATTTCATCCGACTACGAACGACTGCAAGACAATCTCGACAGAATGTGGGAAGTTCTAGTCTTTAACAGCGAGGAAGTCCTTCATGTCTTCAAAACGAGACTGGTACTTGACGCTTTCCAGGGCCGCTTCTCTTACGTTTTCGCGAGGGAAGTAAAGAATATCTGAGTTGTGGGCATTAAATTGGGCACCTCGGAGTGTAAGTATATGCGGCTCGATCAAATTAGGTACTTCACTGTGTAGTCCGCCTCTGTCAAGTCATTCACGCACGAGCAATGTT
TCGCGTGGAGCGAGAAACGCGTATGTGAAGGGCCACATAATCGCGCTTACTTGACCTGCTTTTGATAGACACCGCACGCTGGTCTCCTCCAGCTCTCCACCGAGGGCGGGCTCCCGCAAGGTTCTCGGTTCCCCAGTACCGCTGAAGCTTATTGATAGTGTGCGCCGCTCATCGGTCGAGCCTGGCGAGAAATTATCCATAGGTAATCATTAATGGACCGCCCTCATGCCGGTCAATAACGTACAGCCACGCGGCCCGCAGAGGAATCCTCTACACGTACAGACGACGTCCAGCGACCGAATTCTCCCGTCGGTGAACTATTAAGACCGAGACGATTGCTGGAACGTTATAGACTGTATAGAGAAATCTAAGTTGCGCCATTACAATTGAGTCATGATCAACTTAAACCTTACTATTGGCCGACGACGCGCAGCCCTGATGCATTGGACTCGTACAATCAGAAAGTAAAGTTGGATTCGAAAGTTCGAATCTAGCAACGAGACTGCACTCAGAAGTGCCTAATTTAATCTTTATTGTATTAATACAGGCCCCGACAATCGCCGTTTATTTGGGGCTTGTGTTCCTAGAACTTTGACTTTGATCGACACGTCGCTTAGGCTCCACTCTGAGGTTCCTAACCTCTGTTCCTAGGTTTCGCAGTGTCACCGATTTGTCTTGGATCGTACCTCTGATGTTAAGCAACTACTCGCGTATAGGAGATGCTCATGCTCTTTCCAAAGCAAATCCTGGAAATCGATATGCGACCGAGTCTTGCGCAAACAAAATTTATACGTGCGCGACGTAGGATGCTACCGAATAAGTAAACTCGGGCTTAACCGGTGTGGGGTGCGATGGCATATCTCCGGGGCCAGGGCTCGGGCAGCCAGAGCATTACAACGAAGTAGTTTCAACAGGCCTTGACATGATTTAAACAGCAGGCCACGCATGTATACCTGGGGGGCATAACAAAGGGGTCTCGTTGCCATGGTGGAACTAGACGATATTATCTCCGCGAGCAGGAGTCAGGAGCCACCGTAATGGGCAACAGATCACTCTGTGTTGTCAGAAGTTCACCCATGTGTGACCCGCATTAGGGGATCTAGACGGTGCCTGTTTTGGAGACATTTACGCCGTACTCATTTCTCCCGCGCCCTGGTAAGCCGAATGCGACGCTCAGACTTCGACGTACCGATAAGGGGCAACCTTAGGCCACAAATGACCCGATAAAAGTCAGCAGTTCGCAAGGGTTGGTAGTTGATCGTATCCGTCATAGAGCCCTCGTCGGCTGCTAAGTGTAGAACCACCGACTCTTCGCGTCTGCCCCAGGGCCTTTCATCGCGACCGGTCGTAACACTCCATTGCCAAGAACAACTGCACTAGGTTACGCTAGTTTCATGTACAACGCAATTACGCATCGGGTCTGAGGGCGTCACAGTGATGGCATATATATGCATTTAATTGTTTTTACCAGCAAGATGCGACCAAGACTCTTGAGTCAAAATCTCCTCCGATAAAGCGCGGGGAAGGTCCAACGTTACTTCACTTGCCTCGGCTCCTTTGATATCAATTCGGAGACCCTAGACCTGTGTGCTTAAGTAAGGATGAACACCGTGGAAACCCACACGGCATCACTGAGTCAATGGTGCTCGAAAGAGGAATGACGTGTCTTTCCGCAGGCCGTGCCCGCGCGCCTCTCGCGGTCGATTACGGCAACTACCCGCTGGACTCTCAGCGGGTGGTTACTCTGCTGGGAGGCCCATAGTCGACACCTACTACGATCTCTACACACCGTTTACCGTTTTGTTGATCTGCACAGTGAAGTCCCCAACCTCCTGTTGTAATGCTGGGCGGTCGTACTTCATCAATCCCAATATTACAGCATTGATATCCCACGGCAGACCACCTTAAAGACTAGGAGGCGAGACGCACACAAGAGTTGCTTGCATGGCATGATGTTTACGCTGGCCCAGGGTTTACTGGGACACACTATGTGTTACGAT
GTGACTCGGCTCACGTTTGCGGGAAGGGGACAGGTACAAGGTGGAGCGCGTTGCATGTCTCCTATGTGTATCGTTATCTGGACCTAATCTTAAATTACCACGCGAGCGTCACTGGGAGGGGCTCATACCCACAGTCGAATTTATCTGACACACCCAGATTTATTGACGTTAATCTGCTCTAATTAGGCAAACAATTCTTCGGATCCCCACACCCACCGCGATGCTAGTTAAGGGACGTTAATTGTGACAAGATGATTCGTAGGTAAGACTCAATTAGCTTACTCCCCACGTGCGCCGTGCCCGAGGCCGCTTAGGTACAATACTGTATGAAATATCTACGCCAACCAAGGCGCTAGTATGCTTGCTGACCGGACTCGCTAGGAAACATTTTGAATTCTTCAGTACATGAAACGCAGCAAACATACTAAGAATAATCTCAAAGACGTGAGATCGTATTACTTCCGACTGCACTCCGAAGTTCCAAATCTTTTTGGAGACGTTCCTCGCTAGCCGGCGCATACTCTGCATACTCCCCTTTGCCACGCGGACCACGTGTTACGGGGCATACCGTGCTTGCCCGAGTAGTACGCCACGCATTTAACGAATAAGGCGGGAACCTAAGGAGATTACGTCCAAGAGGAGGTATGGTAAGCGAATTCGCGTTGAGTCACCATATCGTGTAGTCTCTGCTCAATGGATCCCCAATCGGTGCGTTTTCAGGTCCCTGCACTCCGAGGTGCCAAATTTACGGTTAGCTCGCTTGGAGCCAGGTCTCCTGGACCCCAATTTGCGAACTCACTACACCAGTTAATACTTTGCACAGTGAGGTACCGAACCTCAGACCAGGGGGGCGCCAGGAGGTTTCCTCATAATTGCCATCCTCATCTCTCTGTTCTTGGTAACGGGGCATGACAGGCGAGTAAGCGGGTTAAATGTGATCTTGCCTATTGGCCCTCTCTG'
test_peptide = 'LHSEVPNL'
test_result = encode_peptide(test_peptide, test_dna)
# Uncomment to display one matching substring per line:
# print('\n'.join(test_result))

In [None]:
# Read the genome file into a list of whitespace-stripped lines.
with open('data/Bacillus_brevis.txt', 'r') as genome_file:
    bacillus_brevis_genome = [row.strip() for row in genome_file]

In [None]:
# Display the number of lines read from the genome file.
len(bacillus_brevis_genome)

In [None]:
# Peptide to look for in the genome (used by the commented-out call below).
tyrocidine_b1 = 'VKLFPWFNQY'

In [None]:
## result = encode_peptide(tyrocidine_b1, ''.join(bacillus_brevis_genome))

In [None]:
## len(result)

---
## The Cyclopeptide Sequencing Problem


In [None]:
from math import factorial

def nb_subpeptides(cyclopeptide_length, k=2):
    """Return ``k * C(n, k)`` for a cyclic peptide of length ``n``.

    With the default ``k=2`` this is ``n * (n - 1)``, the value the
    notebook's check for ``n = 31315`` expects.

    Args:
        cyclopeptide_length: number of residues ``n`` in the cyclic peptide.
        k: size used in the binomial coefficient.

    Returns:
        The exact integer ``k * n! / (k! * (n - k)!)``.

    Raises:
        ValueError: if ``k`` is greater than ``n`` or either is negative
            (raised by ``factorial``).
    """
    n = cyclopeptide_length
    # Floor division keeps the computation in exact integer arithmetic; the
    # quotient is always an integer. True division would go through a float
    # and silently round results above 2**53.
    binomial_coeff = factorial(n) // (factorial(k) * factorial(n - k))
    return k * binomial_coeff

In [None]:
# Known answer for a cyclic peptide of length 31315 (31315 * 31314).
assert 980597910 == nb_subpeptides(31315)

In [None]:
# Display the count for a cyclic peptide of length 33802.
nb_subpeptides(33802)

---
## Generating Theoretical Spectrum Problem

In [None]:
from utils import INTEGER_MASSES

In [None]:
from collections import deque

def generate_theoretical_spectrum(cyclic_peptide, integer_masses=INTEGER_MASSES):
    """Build the theoretical spectrum of a cyclic peptide.

    Args:
        cyclic_peptide: sequence of amino-acid letters.
        integer_masses: mapping ``letter -> integer mass``.

    Returns:
        A list of ``(subpeptide, mass)`` tuples: first ``('', 0)``, then, for
        each rotation of the peptide, every proper prefix of that rotation,
        and finally one entry for the whole peptide. The whole-peptide entry
        is spelled as the last rotation visited, not necessarily as the
        input string. An empty peptide yields ``[('', 0)]``.

    Raises:
        KeyError: if a letter is missing from ``integer_masses``.
    """
    masses = [('', 0)]
    # Guard: without any residue there is no rotation to read the final
    # letter from (this used to fail with UnboundLocalError).
    if not cyclic_peptide:
        return masses

    peptide = deque(cyclic_peptide)
    for _ in range(len(peptide)):
        rotation = list(peptide)
        amino_acid = ''
        mass = 0
        # Every proper prefix of the current rotation is one subpeptide.
        for letter in rotation[:-1]:
            amino_acid += letter
            mass += integer_masses[letter]
            masses.append((amino_acid, mass))
        peptide.rotate()

    # amino_acid / mass now describe the last rotation minus its final
    # residue; adding that residue gives the whole peptide exactly once.
    last_letter = rotation[-1]
    masses.append((amino_acid + last_letter, mass + integer_masses[last_letter]))
    return masses

In [None]:
def display_masses(spectrum):
    """Render the masses of a spectrum as one space-separated string.

    ``spectrum`` is an iterable of ``(subpeptide, mass)`` pairs; only the
    masses are kept, sorted in ascending order.
    """
    ordered = sorted(mass for _, mass in spectrum)
    return ' '.join(str(mass) for mass in ordered)

In [None]:
# Sample cyclic peptide and the sorted masses expected for its spectrum.
sample_input = 'LEQN'
sample_output = '0 113 114 128 129 227 242 242 257 355 356 370 371 484'

In [None]:
# Check the generated spectrum of the sample peptide against the expected masses.
sample_spectrum = generate_theoretical_spectrum(sample_input)
assert sample_output == display_masses(sample_spectrum)

In [None]:
extra_input = 'IAQMLFYCKVATN'
extra_output = '0 71 71 99 101 103 113 113 114 128 128 131 147 163 170 172 184 199 215 227 227 231 244 259 260 266 271 286 298 298 310 312 328 330 330 372 385 391 394 399 399 399 401 413 423 426 443 443 470 493 498 502 513 519 526 527 541 554 556 557 564 569 590 598 616 626 640 654 657 658 665 670 682 697 697 703 711 729 729 753 753 771 779 785 785 800 812 817 824 825 828 842 856 866 884 892 913 918 925 926 928 941 955 956 963 969 980 984 989 1012 1039 1039 1056 1059 1069 1081 1083 1083 1083 1088 1091 1097 1110 1152 1152 1154 1170 1172 1184 1184 1196 1211 1216 1222 1223 1238 1251 1255 1255 1267 1283 1298 1310 1312 1319 1335 1351 1354 1354 1368 1369 1369 1379 1381 1383 1411 1411 1482'

In [None]:
# Same check on the longer extra peptide.
extra_spectrum = generate_theoretical_spectrum(extra_input)
assert extra_output == display_masses(extra_spectrum)

In [None]:
# Peptide whose spectrum is printed in the next cell.
test_peptide = 'CATMPQCWQAKMMHW'
test_spectrum = generate_theoretical_spectrum(test_peptide)

In [None]:
# Print the sorted masses of the test peptide's spectrum
# (presumably the answer for 'data/dataset_98_4.txt' -- confirm).
print(display_masses(test_spectrum))

--- 
## Counting Peptides with Given Mass Problem

In [None]:
from collections import defaultdict

def nb_linear_peptides(parent_mass, integer_masses=None):
    """Count the linear peptides whose integer mass equals ``parent_mass``.

    A peptide is an ordered sequence of residues, so two orderings of the
    same residues count separately. Residues that share a mass are treated
    as one, because only the distinct values of the mass table are used.

    The count is built bottom-up: the number of peptides of mass ``m`` is the
    sum, over every residue mass ``a <= m``, of the number of peptides of
    mass ``m - a``. This takes ``parent_mass * len(masses)`` additions
    instead of enumerating every peptide.

    Args:
        parent_mass: target total mass.
        integer_masses: optional mapping ``residue -> integer mass``; its
            values are assumed to be positive integers. When omitted, the
            module-level ``INTEGER_MASSES`` is used.

    Returns:
        The number of peptides of that mass. The empty peptide is the single
        peptide of mass 0, so ``parent_mass == 0`` gives 1; a negative
        ``parent_mass`` gives 0.
    """
    if integer_masses is None:
        integer_masses = INTEGER_MASSES
    if parent_mass < 0:
        return 0

    # Sorted so the inner loop can stop at the first mass that is too large.
    masses = sorted(set(integer_masses.values()))

    # counts[m] = number of peptides with total mass m.
    counts = [0] * (parent_mass + 1)
    counts[0] = 1
    for total in range(1, parent_mass + 1):
        ways = 0
        for mass in masses:
            if mass > total:
                break
            # Every peptide of mass (total - mass) extends by this residue.
            ways += counts[total - mass]
        counts[total] = ways
    return counts[parent_mass]

In [None]:
# Sample parent mass and the expected number of linear peptides of that mass.
sample_input = 1024
sample_output = 14712706211

In [None]:
## %time sample_result = nb_linear_peptides(sample_input)

In [None]:
## assert sample_result == sample_output

In [None]:
# Extra parent mass and the expected number of linear peptides of that mass.
extra_input = 1307
extra_output = 34544458837656

In [None]:
## extra_result = nb_linear_peptides(extra_input)

In [None]:
## assert extra_result == extra_output

--- 
## Cyclopeptide Sequencing using a Branch-and-Bound Algorithm

In [None]:
from collections import deque

def cyclospectrum(cyclic_peptide_masses):
    """Return the sorted spectrum of a cyclic peptide given as residue masses.

    The result holds 0, the total mass, and the mass of every run of
    1 to ``n - 1`` consecutive residues starting at each position of the
    ring (wrapping around the end).
    """
    ring = list(cyclic_peptide_masses)
    size = len(ring)
    # Two copies back to back let a run wrap around without modular indexing.
    unrolled = ring + ring

    spectrum = [0, sum(ring)]
    for start in range(size):
        running = 0
        for offset in range(size - 1):
            running += unrolled[start + offset]
            spectrum.append(running)

    spectrum.sort()
    return spectrum

In [None]:
from collections import Counter

def is_consistent(small, big):
    """Tell whether ``small`` is a sub-multiset of ``big``.

    True when every item of ``small`` occurs in ``big`` at least as many
    times as it occurs in ``small``.
    """
    # Counter subtraction keeps only strictly positive counts, so whatever
    # remains is exactly the set of items that `big` cannot cover.
    uncovered = Counter(small) - Counter(big)
    return not uncovered

    CyclopeptideSequencing(Spectrum)
        CandidatePeptides ← a set containing only the empty peptide
        FinalPeptides ← empty list of strings
        while CandidatePeptides is nonempty
            CandidatePeptides ← Expand(CandidatePeptides)
            for each peptide Peptide in CandidatePeptides
                if Mass(Peptide) = ParentMass(Spectrum)
                    if Cyclospectrum(Peptide) = Spectrum and Peptide is not in FinalPeptides
                        append Peptide to FinalPeptides
                    remove Peptide from CandidatePeptides
                else if Peptide is not consistent with Spectrum
                    remove Peptide from CandidatePeptides
        return FinalPeptides

In [None]:
def _linear_spectrum(peptide_masses):
    """Return 0 plus the mass of every contiguous run of a linear peptide."""
    prefix_sums = [0]
    for mass in peptide_masses:
        prefix_sums.append(prefix_sums[-1] + mass)
    spectrum = [0]
    for start in range(len(peptide_masses)):
        for end in range(start + 1, len(peptide_masses) + 1):
            spectrum.append(prefix_sums[end] - prefix_sums[start])
    return spectrum


def cyclopeptide_sequencing(experimental_spectrum):
    """Find every cyclic peptide whose cyclospectrum equals the given spectrum.

    Branch and bound: candidate linear peptides are grown one residue at a
    time. A candidate that reaches the parent mass is kept when its
    cyclospectrum matches the experimental spectrum. A lighter candidate is
    expanded further only while its linear spectrum is contained in the
    experimental spectrum.

    Args:
        experimental_spectrum: iterable of integer masses (any order).

    Returns:
        A list of peptides, each a list of residue masses. Every rotation
        and reflection of a solution appears as its own entry. An empty
        spectrum yields an empty list.
    """
    spectrum = sorted(experimental_spectrum)
    if not spectrum:
        return []
    parent_mass = spectrum[-1]

    # Only residue masses that occur in the spectrum can be part of a solution.
    masses = set(INTEGER_MASSES.values())
    masses.intersection_update(spectrum)

    final_peptides = []
    # Start from the empty peptide so that one-residue peptides are also
    # tested against the parent mass.
    candidate_peptides = [[]]
    while candidate_peptides:
        current_candidate = candidate_peptides.pop()
        for amino_acid in masses:
            peptide = current_candidate + [amino_acid]
            total = sum(peptide)
            if total == parent_mass:
                # Compare like with like: stored peptides and the candidate
                # are both plain residue-mass lists.
                if (
                    cyclospectrum(peptide) == spectrum
                    and peptide not in final_peptides
                ):
                    final_peptides.append(peptide)
                continue
            if total > parent_mass:
                continue
            # Bound on the whole linear spectrum, not just the residues: any
            # prefix of a true solution passes, most dead ends fail early.
            if not is_consistent(_linear_spectrum(peptide), spectrum):
                continue
            candidate_peptides.append(peptide)
    return final_peptides

In [None]:
def prepare_input(input_):
    """Parse a string of integers separated by single spaces into a list."""
    tokens = input_.split(' ')
    return [int(token) for token in tokens]

def format_output(output_):
    """Format peptides as dash-joined masses, separated by single spaces."""
    rendered = []
    for element in output_:
        rendered.append('-'.join(str(value) for value in element))
    return ' '.join(rendered)

In [None]:
# Parse the sample spectrum, run the sequencing and format the peptides found.
sample_input_ = '0 113 128 186 241 299 314 427'
sample_output = '186-128-113 186-113-128 128-186-113 128-113-186 113-186-128 113-128-186'
sample_input = prepare_input(sample_input_)

sample_result_ = cyclopeptide_sequencing(sample_input)
sample_result = format_output(sample_result_)

# Compare as sorted lists so the order of the returned peptides does not matter.
assert sorted(sample_result.split(' ')) == sorted(sample_output.split(' '))

In [None]:
extra_input_ = '0 71 97 99 103 113 113 114 115 131 137 196 200 202 208 214 226 227 228 240 245 299 311 311 316 327 337 339 340 341 358 408 414 424 429 436 440 442 453 455 471 507 527 537 539 542 551 554 556 566 586 622 638 640 651 653 657 664 669 679 685 735 752 753 754 756 766 777 782 782 794 848 853 865 866 867 879 885 891 893 897 956 962 978 979 980 980 990 994 996 1022 1093'
extra_input = prepare_input(extra_input_)
extra_output = '103-137-71-131-114-113-113-115-99-97 103-97-99-115-113-113-114-131-71-137 113-113-114-131-71-137-103-97-99-115 113-113-115-99-97-103-137-71-131-114 113-114-131-71-137-103-97-99-115-113 113-115-99-97-103-137-71-131-114-113 114-113-113-115-99-97-103-137-71-131 114-131-71-137-103-97-99-115-113-113 115-113-113-114-131-71-137-103-97-99 115-99-97-103-137-71-131-114-113-113 131-114-113-113-115-99-97-103-137-71 131-71-137-103-97-99-115-113-113-114 137-103-97-99-115-113-113-114-131-71 137-71-131-114-113-113-115-99-97-103 71-131-114-113-113-115-99-97-103-137 71-137-103-97-99-115-113-113-114-131 97-103-137-71-131-114-113-113-115-99 97-99-115-113-113-114-131-71-137-103 99-115-113-113-114-131-71-137-103-97 99-97-103-137-71-131-114-113-113-115'

In [None]:
%%time
## extra_result_ = cyclopeptide_sequencing(extra_input)
## extra_result = format_output(extra_result_)

In [None]:
# assert sorted(extra_result.split(' ')) == sorted(extra_output.split(' '))

In [None]:
test_input_ = '0 71 71 71 87 99 101 114 114 142 142 172 185 186 200 201 213 228 243 256 271 287 299 300 314 315 327 342 358 370 386 401 413 414 428 429 441 457 472 485 500 515 527 528 542 543 556 586 586 614 614 627 629 641 657 657 657 728'
test_input = prepare_input(test_input_)

In [None]:
%%time
## test_result_ = cyclopeptide_sequencing(test_input)

In [None]:
## test_result = format_output(test_result_)
## print(test_result)

---