In [47]:
import numpy as np


def do_range_expand(peptides, alphabet):
    """Extend every peptide by every alphabet entry.

    Each result is a fresh list: a copy of one input peptide with a single
    alphabet entry appended. Results are grouped by input peptide and, within
    a group, follow the order of ``alphabet``.
    """
    return [
        list(base) + [letter]
        for base in peptides
        for letter in alphabet
    ]


def count_peptide_mass(peptide):
    """Return the sum of the entries of ``peptide`` as computed by ``np.sum``."""
    total_mass = np.sum(peptide)
    return total_mass


def count_parent_mass(spec):
    """Return the largest value in ``spec``, or 0 when ``spec`` is empty."""
    return np.max(spec) if len(spec) != 0 else 0


def generate_sybpeptides(peptide):
    """Return the masses of the cyclic sub-peptides of ``peptide``.

    The result starts with 0, then lists the sums of every wrap-around window
    of width 1 .. len(peptide) - 1 (grouped by width, ordered by start
    position), and ends with the sum of the whole peptide.
    """
    size = len(peptide)
    doubled = peptide + peptide  # lets a window wrap past the end
    masses = [0]
    for width in range(1, size):
        masses.extend(
            sum(doubled[start:start + width]) for start in range(size)
        )
    masses.append(np.sum(peptide))
    return masses


def find_score(peptide, spec):
    """Count how many sub-peptide masses of ``peptide`` occur in ``spec``.

    Every entry of ``spec`` can be matched at most once. An empty peptide is
    given the score ``len(spec)``.
    """
    if not len(peptide):
        return len(spec)

    theoretical = generate_sybpeptides(peptide)
    unmatched = list(spec)  # working copy; the caller's spectrum is untouched
    matched = 0
    for sub_mass in theoretical:
        if sub_mass not in unmatched:
            continue
        unmatched.remove(sub_mass)
        matched += 1
    return matched


def do_trim(leaders, spec, N):
    """Keep the ``N`` highest-scoring peptides of ``leaders``.

    When ``leaders`` holds at most ``N`` peptides it is returned unchanged.
    Otherwise the peptides are ranked by ``find_score`` in ascending order and
    the last ``N`` are returned (ties at the cut are not extended).
    """
    if len(leaders) > N:
        ranked = list(leaders)
        ranked.sort(key=lambda pep: find_score(pep, spec))
        return ranked[len(ranked) - N:]
    return leaders


def do_leaderboard_cyclopeptide_sequencing(spec, N, alphabet):
    """Leaderboard search for the peptide that best matches ``spec``.

    Starting from the empty peptide, each round appends every entry of
    ``alphabet`` to every peptide on the board, discards peptides heavier than
    the parent mass, records the best-scoring peptide whose mass equals the
    parent mass, and trims the board to ``N`` entries. The search stops when
    the board is empty.

    Returns the best peptide found, or ``[]`` when none reached the parent
    mass with a positive score.
    """
    parent_mass = count_parent_mass(spec)
    best_peptide = []
    best_score = 0
    board = [[]]

    while len(board) > 0:
        survivors = []
        for candidate in do_range_expand(board, alphabet):
            candidate_mass = count_peptide_mass(candidate)
            if candidate_mass > parent_mass:
                continue  # too heavy: never kept
            survivors.append(candidate)
            if candidate_mass == parent_mass:
                candidate_score = find_score(candidate, spec)
                if candidate_score > best_score:
                    best_peptide = candidate
                    best_score = candidate_score
        board = do_trim(survivors, spec, N)

    return best_peptide


def get_alphabet(M, spec, min_mass=57, max_mass=200):
    """Pick the ``M`` most frequent candidate amino-acid masses for ``spec``.

    Candidates are the entries of ``spec`` plus the absolute pairwise
    differences of its entries, restricted to ``min_mass .. max_mass``
    (inclusive).

    :param M: maximum number of masses to return
    :param spec: sequence of integer masses
    :param min_mass: smallest mass accepted as a candidate
    :param max_mass: largest mass accepted as a candidate
    :return: distinct masses ordered by ascending frequency (ties keep the
        order of first appearance), cut to the last ``M``; an empty list when
        no candidate is in range
    """
    differences = [
        int(abs(spec[i] - spec[j]))
        for i in range(len(spec) - 1)
        for j in range(i + 1, len(spec))
    ]

    candidates = [value for value in spec if min_mass <= value <= max_mass]
    candidates += [value for value in differences if min_mass <= value <= max_mass]
    if not candidates:
        # Nothing in range: return an empty alphabet instead of failing.
        return []

    # dict keeps first-appearance order, which the stable sort below relies on
    # to break frequency ties.
    counts = {}
    for value in candidates:
        counts[value] = counts.get(value, 0) + 1

    ranked = sorted(counts, key=counts.get)
    if len(ranked) > M:
        ranked = ranked[len(ranked) - M:]
    return ranked


def do_convolution_cyclopeptide_sequencing(M, N, spec):
    """Derive an alphabet of at most ``M`` masses from ``spec`` and run the
    leaderboard search with board size ``N`` over that alphabet."""
    return do_leaderboard_cyclopeptide_sequencing(spec, N, get_alphabet(M, spec))

# def main():
#     file = open('data.txt', 'r')
    
#     m = int(next(file))
#     n = int(next(file))
#     spectrum = [int(s) for s in next(file).strip().split()]
    
#     print(do_convolution_cyclopeptide_sequencing(m, n, spectrum))

def final_result_hw20(M, N, text):
    """Parse a whitespace-separated spectrum, sequence it and print the result.

    :param M: alphabet size passed to the convolution step
    :param N: leaderboard size
    :param text: integer masses separated by whitespace
    :raises ValueError: if a token of ``text`` is not an integer

    Prints the resulting peptide as masses joined by ``-``.
    """
    # str.split() tolerates repeated, leading and trailing whitespace, which
    # a character-by-character scan on single spaces does not.
    spec = [int(token) for token in text.split()]

    res = do_convolution_cyclopeptide_sequencing(M, N, spec)
    print("-".join(str(mass) for mass in res))

def main():
    """Read M, N and the spectrum line from ``rosalind_ba4i.txt`` and print
    the sequenced peptide."""
    # The context manager closes the file even if parsing fails.
    with open('rosalind_ba4i.txt', 'r') as file:
        m = int(next(file))
        n = int(next(file))
        text = next(file).strip()
    final_result_hw20(m, n, text)

In [48]:
# Call main() only when this code runs as the main program.
if __name__ == "__main__":
    main()

128-163-147-99-186-87-115-147-128-186-186-103-57


In [None]:
97-163-131-129-129-147-57-57-129-113-115-114-128

In [29]:
import numpy as np


def expand(peptides, alphabet):
    """Return every peptide of ``peptides`` extended by one ``alphabet`` entry.

    Each result is a new list; the input peptides are not modified. Results
    are grouped by input peptide, in ``alphabet`` order within a group.
    """
    grown = []
    for base in peptides:
        prefix = list(base)
        grown.extend(prefix + [letter] for letter in alphabet)
    return grown


def mass(peptide):
    """Return the sum of the entries of ``peptide`` as computed by ``np.sum``."""
    total = np.sum(peptide)
    return total


def theoretical_spectrum(peptide):
    """Return the cyclic sub-peptide masses of ``peptide``.

    Layout: a leading 0, the sums of every wrap-around window of width
    1 .. len(peptide) - 1 (by width, then by start position), and finally the
    sum of the whole peptide.
    """
    size = len(peptide)
    wrapped = peptide + peptide  # lets a window run past the end
    window_masses = [
        sum(wrapped[start:start + width])
        for width in range(1, size)
        for start in range(size)
    ]
    return [0] + window_masses + [np.sum(peptide)]


def score(peptide, spectrum):
    """Count the theoretical masses of ``peptide`` that occur in ``spectrum``.

    Each spectrum entry can be matched at most once. An empty peptide is given
    the score ``len(spectrum)``.
    """
    if not len(peptide):
        return len(spectrum)

    theoretical = theoretical_spectrum(peptide)
    unmatched = spectrum.copy()  # consume matches without touching the input

    hits = 0
    for sub_mass in theoretical:
        if sub_mass not in unmatched:
            continue
        unmatched.remove(sub_mass)
        hits += 1
    return hits


def trim(leaderboard, spec, n):
    """Keep the ``n`` highest-scoring peptides of ``leaderboard``.

    A board with at most ``n`` peptides is returned unchanged. Otherwise the
    peptides are ranked by ``score`` in ascending order and the last ``n`` are
    returned (ties at the cut are not extended).
    """
    if len(leaderboard) > n:
        ranked = list(leaderboard)
        ranked.sort(key=lambda pep: score(pep, spec))
        return ranked[len(ranked) - n:]
    return leaderboard


def find_amino_alphabet(m, spectrum):
    """Return up to ``m`` of the most frequent candidate masses of ``spectrum``.

    Candidates are the spectrum entries and the absolute pairwise differences
    of its entries that lie in 57..200 inclusive. The distinct candidates
    (as produced by ``np.unique``) are ordered by ascending frequency and the
    last ``m`` are returned.
    """
    size = len(spectrum)
    differences = [
        int(np.abs(spectrum[a] - spectrum[b]))
        for a in range(size - 1)
        for b in range(a + 1, size)
    ]

    # Spectrum entries first, then differences, each filtered to the range.
    alphabet = [v for v in list(spectrum) + differences if 57 <= v <= 200]

    alphabet_counts = np.zeros(201)
    for value in alphabet:
        alphabet_counts[value] += 1

    ranked = sorted(np.unique(alphabet), key=lambda k: alphabet_counts[k])
    surplus = len(ranked) - m
    return ranked[surplus:] if surplus > 0 else ranked


def leaderboard_cyclopeptide_sequencing(spectrum, n, alphabet):
    """Leaderboard search over ``alphabet`` for the peptide matching ``spectrum``.

    Each round expands the board by every alphabet entry, drops peptides
    heavier than ``max(spectrum)``, remembers the best peptide whose mass
    equals it (a score above 1 is required to become leader), and trims the
    board to ``n`` entries. The loop ends when the board is empty.

    Returns the leader's masses joined by ``-`` (an empty string if no leader
    was found).
    """
    parent_mass = max(spectrum)
    best_peptide = []
    best_score = 1
    board = [[]]

    while len(board):
        kept = []
        for candidate in expand(board, alphabet):
            candidate_mass = mass(candidate)
            if candidate_mass > parent_mass:
                continue  # too heavy: dropped from the next board
            kept.append(candidate)
            if candidate_mass == parent_mass:
                candidate_score = score(candidate, spectrum)
                if candidate_score > best_score:
                    best_peptide = candidate
                    best_score = candidate_score
        board = trim(kept, spectrum, n)

    return '-'.join(str(value) for value in best_peptide)


def convolution_cyclopeptide_sequencing(m, n, spectrum):
    """Build an alphabet of at most ``m`` masses from ``spectrum`` and run the
    leaderboard search with board size ``n`` over it."""
    amino_alphabet = find_amino_alphabet(m, spectrum)
    result = leaderboard_cyclopeptide_sequencing(spectrum, n, amino_alphabet)
    return result


def main():
    """Read M, N and the spectrum from ``rosalind_ba4i.txt`` and print the
    sequenced peptide."""
    # The context manager closes the file even if parsing fails.
    with open('rosalind_ba4i.txt', 'r') as file:
        m = int(next(file))
        n = int(next(file))
        spectrum = [int(s) for s in next(file).strip().split()]

    print(convolution_cyclopeptide_sequencing(m, n, spectrum))

In [30]:
# Call main() only when this code runs as the main program.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [49]:
def main():
    """Read the alphabet size, leaderboard size and spectrum from
    ``rosalind_ba4i.txt`` and print the sequenced peptide as ``-``-joined
    masses."""
    # The context manager closes the file even if parsing fails.
    with open('rosalind_ba4i.txt', 'r') as file:
        alphabet_size = int(next(file))
        lb_size = int(next(file))
        spectrum = [int(s) for s in next(file).strip().split()]

    # Alternative input source (standard input), kept for reference:
    #     alphabet_size = int(input())
    #     lb_size = int(input())
    #     spectrum = list(map(int, input().split()))

    amino_alphabet = calculate_amino_alphabet(spectrum, alphabet_size)
    result = sequence_peptide(spectrum, lb_size, amino_alphabet)
    print('-'.join(map(str, result)))


def _attach_amino_mass(peptide: tuple, amino_alphabet: list) -> list:
    """
    "Attach" all possible amino acid masses to the given peptide and return the resulting peptides
    """
    return [peptide + (mass,) for mass in amino_alphabet]


def _cyclospectrum_score(peptide: tuple, spectrum: dict) -> int:
    """
    Score ``peptide`` against a map-spectrum.

    The distinct masses of the peptide's cyclic sub-peptides (all wrap-around
    windows shorter than the peptide, plus the whole peptide) are collected,
    and the ``spectrum`` values of those masses are added up. A mass missing
    from ``spectrum`` contributes 0; how often a mass occurs in the peptide
    does not matter.
    """
    size = len(peptide)

    def _sub_masses():
        # Windows of each length, starting at every position of the cycle.
        for window in range(1, size):
            wrapped = peptide + peptide[:(window - 1)]
            for start in range(size):
                yield _peptide_mass(wrapped[start:(start + window)])
        yield _peptide_mass(peptide)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    distinct_masses = dict.fromkeys(_sub_masses())
    return sum(spectrum.get(sub_mass, 0) for sub_mass in distinct_masses)


def _peptide_mass(peptide: tuple):
    """
    Calculate peptide mass
    """
    return sum(peptide)


def _expand_leaderboard(leaderboard: list, spectrum: dict, amino_alphabet: list):
    """
    Replace the contents of ``leaderboard`` with its one-step expansion
    :param leaderboard: list of ``[score, peptide]`` pairs; modified in place
    :param spectrum: map-spectrum used to score the new peptides
    :param amino_alphabet: masses appended to each peptide
    :return: None; ``leaderboard`` ends up holding only the newly built pairs,
        sorted by score in descending order
    """
    children = []
    for parent_pair in leaderboard:
        for child in _attach_amino_mass(parent_pair[1], amino_alphabet):
            children.append([_cyclospectrum_score(child, spectrum), child])

    # Slice assignment keeps the caller's list object, dropping the parents.
    leaderboard[:] = children
    leaderboard.sort(key=lambda pair: pair[0], reverse=True)


def _build_spectrum(spectrum: list) -> dict:
    """
    Convert a list-spectrum into a map-spectrum
    """
    result = {}
    for amino in spectrum:
        result[amino] = 1
    return result


def sequence_peptide(spectrum: list, lb_size: int, amino_alphabet: list) -> list:
    """
    Sequence a peptide using a leaderboard algorithm
    :param spectrum: spectrum of a peptide (not an ideal one); need not be sorted
    :param lb_size: leaderboard size (ties with the last kept score are kept too)
    :param amino_alphabet: alphabet of amino acids
    :return: the leading peptide as a tuple of masses (empty if none was found)
    """
    # The parent mass is the largest mass in the spectrum. Taking max() rather
    # than the last element makes this independent of the input order, and an
    # empty spectrum yields 0 instead of an IndexError.
    parent_mass = max(spectrum) if spectrum else 0
    spectrum = _build_spectrum(spectrum)

    leaderboard = [[0, ()]]
    leader = [0, ()]

    while len(leaderboard) > 0:
        # Expands in place; afterwards the board is sorted by score, descending.
        _expand_leaderboard(leaderboard, spectrum, amino_alphabet)

        survivors = []
        for p_pair in leaderboard:
            peptide_mass = _peptide_mass(p_pair[1])
            if peptide_mass > parent_mass:
                continue
            if peptide_mass == parent_mass:
                # A full-mass peptide stays only if it becomes the new leader.
                if p_pair[0] <= leader[0]:
                    continue
                leader = p_pair
            survivors.append(p_pair)

        # Keep the top lb_size entries plus everything tied with the last
        # kept score; the first entry is always kept.
        leaderboard = []
        for j, p_pair in enumerate(survivors):
            if j > 0 and j >= lb_size and survivors[j - 1][0] > p_pair[0]:
                break
            leaderboard.append(p_pair)

    return leader[1]


def calculate_amino_alphabet(spectrum: list, alphabet_size: int) -> list:
    """
    Calculate an alphabet of amino acids using convolutions method over the given 'spectrum'
    :param spectrum: list of masses; need not be sorted and is not modified
    :param alphabet_size: number of masses to keep (ties with the last kept
        frequency are kept too; the most frequent mass is always kept)
    :return: masses in 57..200 ordered by descending frequency
    """
    # A 0 mass is prepended so that the spectrum entries themselves take part
    # in the convolution.
    masses = [0] + list(spectrum)

    frequency = {}
    for i in range(len(masses)):
        for j in range(i + 1, len(masses)):
            # abs() so that pairs are counted whatever the input order;
            # a signed difference silently drops pairs of an unsorted spectrum.
            conv = abs(masses[j] - masses[i])
            if 57 <= conv <= 200:
                frequency[conv] = frequency.get(conv, 0) + 1

    # Stable sort: equally frequent masses keep their first-seen order.
    ranked = sorted(frequency.items(), key=lambda item: item[1], reverse=True)

    result = []
    for idx, (conv_mass, count) in enumerate(ranked):
        if idx > 0 and idx >= alphabet_size and ranked[idx - 1][1] > count:
            break
        result.append(conv_mass)

    return result


# Call main() only when this code runs as the main program.
if __name__ == '__main__':
    main()


103-57-128-163-147-99-186-87-115-147-128-186-186


In [None]:
Output:
113-115-114-128-97-163-131-129-129-147-57-57-129
97-163-131-129-129-147-57-57-129-113-115-114-128

In [50]:
from collections import Counter, defaultdict
import numpy as np
import numba
from tqdm import tqdm

# Fixed table of 18 integer values, presumably amino acid masses — TODO confirm.
# NOTE(review): none of the code visible in this cell reads this array.
aminoacid_lst = np.array([
    57, 71, 87, 97, 99, 101, 103, 
    113, 114, 115, 128, 129, 131, 
    137, 147, 156, 163, 186 
])  


#@numba.jit(nopython=True)
def expand(leaderboard, allowed_aminoacids):
    """Return every peptide of ``leaderboard`` extended by one allowed mass.

    Each result is a new list (``peptide + [aminoacid]``); results are grouped
    by peptide, in the iteration order of ``allowed_aminoacids``.
    """
    return [
        peptide + [aminoacid]
        for peptide in leaderboard
        for aminoacid in allowed_aminoacids
    ]


#@numba.jit(nopython=True)
def mass(peptide):
    """Return the sum of the entries of ``peptide`` (0 when it is empty)."""
    total = 0
    for aminoacid in peptide:
        total += aminoacid
    return total


#@numba.jit(nopython=True)
def parent_mass(spectrum):
    """Return the last element of ``spectrum``.

    NOTE(review): this is the largest mass only if ``spectrum`` is sorted in
    ascending order — confirm that callers sort it first.
    """
    last_index = len(spectrum) - 1
    return spectrum[last_index]


#@numba.jit(nopython=True)
def cyclospectrum(peptide):
    """Return the sorted masses of the cyclic sub-peptides of ``peptide``.

    Contains 0, the mass of every wrap-around window of width
    1 .. len(peptide) - 1 starting at each position, and the mass of the whole
    peptide.
    """
    size = len(peptide)
    doubled = peptide + peptide  # lets a window wrap past the end
    masses = [0]
    masses.extend(
        mass(doubled[start:start + width])
        for width in range(1, size)
        for start in range(size)
    )
    masses.append(mass(peptide))
    masses.sort()
    return masses


def linspectrum(peptide):
    """Return the sorted masses of the contiguous sub-peptides of ``peptide``.

    Contains 0, the mass of every window of width 1 .. len(peptide) - 1 that
    fits without wrapping, and the mass of the whole peptide.
    """
    size = len(peptide)
    masses = [0]
    masses.extend(
        mass(peptide[start:start + width])
        for width in range(1, size)
        for start in range(size - width + 1)
    )
    masses.append(mass(peptide))
    masses.sort()
    return masses


def score(peptide, spectrum):
    """Return the number of masses shared by the cyclospectrum of ``peptide``
    and ``spectrum``, counting multiplicities.

    For every distinct mass in the peptide's cyclospectrum the smaller of its
    two occurrence counts is taken; the counts are added with ``np.sum``.
    """
    peptide_counts = Counter(cyclospectrum(peptide))
    theor_counts = Counter(spectrum)
    shared = []
    for base, occurrences in peptide_counts.items():
        shared.append(min(occurrences, theor_counts[base]))
    return np.sum(shared)


def cut(leaderboard, spectrum, n):
    """Keep the ``n`` best-scoring peptides of ``leaderboard``, plus ties.

    :param leaderboard: list of peptides (lists of masses)
    :param spectrum: spectrum every peptide is scored against
    :param n: number of top peptides to keep before ties are added
    :return: the top ``n`` peptides in descending score order, followed by any
        further distinct peptides whose score equals that of the last kept
        one; an empty list if nothing is selected
    """
    # Score each peptide exactly once; scoring dominates the running time.
    scores = [score(peptide, spectrum) for peptide in leaderboard]

    # Stable descending sort of positions, so equal scores keep board order.
    top = sorted(
        range(len(leaderboard)),
        key=scores.__getitem__,
        reverse=True
    )[:n]
    if not top:
        return []

    candidates = [leaderboard[i] for i in top]
    threshold = scores[top[-1]]

    # Tuples make the membership test a hash lookup instead of a list scan.
    seen = {tuple(peptide) for peptide in candidates}
    for peptide, peptide_score in zip(leaderboard, scores):
        if peptide_score != threshold:
            continue
        key = tuple(peptide)
        if key not in seen:
            seen.add(key)
            candidates.append(peptide)
    return candidates


def peptide_repr(peptide):
    """Return the entries of ``peptide`` converted to strings and joined by ``-``."""
    return '-'.join(map(str, peptide))


def convolve(spectrum):
    """Return the absolute pairwise differences of ``spectrum`` with counts.

    The result is a list of ``(difference, count)`` tuples ordered by
    descending count; equal counts keep first-encountered order.
    """
    size = len(spectrum)
    differences = Counter()
    for i in range(size - 1):
        for j in range(i + 1, size):
            differences[abs(spectrum[i] - spectrum[j])] += 1
    return differences.most_common()


#@numba.jit(nopython=True)
def leaderboard_cyclopeptide_sequencing(spectrum, n, allowed_aminoacids):
    """Leaderboard search for the peptide whose cyclospectrum best fits ``spectrum``.

    :param spectrum: experimental spectrum; ``parent_mass`` reads its last
        element, so it is expected to be sorted ascending
    :param n: leaderboard size passed to ``cut``
    :param allowed_aminoacids: iterable of masses used to extend peptides
    :return: the best-scoring peptide whose mass equals the parent mass, or
        ``[]`` if none beats the score of the empty peptide

    Every improvement of the leader is printed as it is found.
    """
    if len(spectrum) == 0:
        return []

    # Loop invariants, computed once rather than once per peptide.
    target_mass = parent_mass(spectrum)
    leader_peptide = []
    leader_score = score(leader_peptide, spectrum)

    # Peptides already queued, keyed by their string form. The key must be
    # the peptide's representation, not the peptide_repr function itself,
    # and a peptide is recorded only after the membership check.
    seen = set()

    leaderboard = [[]]
    while leaderboard:
        candidate_lst = []
        for peptide in expand(leaderboard, allowed_aminoacids):
            peptide_mass = mass(peptide)
            if peptide_mass == target_mass:
                peptide_score = score(peptide, spectrum)
                if peptide_score > leader_score:
                    leader_peptide = peptide
                    leader_score = peptide_score
                    print(peptide_repr(leader_peptide))
            elif peptide_mass < target_mass:
                key = peptide_repr(peptide)
                if key not in seen:
                    seen.add(key)
                    candidate_lst.append(peptide)
        leaderboard = cut(candidate_lst, spectrum, n)
    return leader_peptide


def convolution_cyclopeptide_sequencing(spectrum, n, m):
    """Choose an alphabet from the spectral convolution and run the leaderboard search.

    :param spectrum: experimental spectrum
    :param n: leaderboard size
    :param m: number of most frequent convolution masses (in 57..200) to use;
        masses tied in frequency with the m-th one are included as well
    :return: result of ``leaderboard_cyclopeptide_sequencing`` over that alphabet

    If the convolution holds fewer than ``m`` masses in range, all of them are
    used.
    """
    spectrum_conv = convolve(spectrum)
    allowed_aminoacids = set()
    # Bounded iteration: an open-ended index loop would run off the end of
    # spectrum_conv when the m-th in-range mass does not exist.
    for i, (conv_mass, count) in enumerate(spectrum_conv):
        if 57 <= conv_mass <= 200:
            allowed_aminoacids.add(conv_mass)
        if len(allowed_aminoacids) == m:
            # Add every later in-range mass that ties with the current count.
            allowed_aminoacids.update(
                later_mass
                for later_mass, later_count in spectrum_conv[i + 1:]
                if later_count == count and 57 <= later_mass <= 200
            )
            break
    return leaderboard_cyclopeptide_sequencing(
        spectrum, n, allowed_aminoacids
    )

# Read M, N and the spectrum; the context manager closes the file afterwards.
with open('rosalind_ba4i.txt', 'r') as file:
    m = int(next(file))
    n = int(next(file))
    # Sorted ascending because parent_mass() reads the last element.
    spectrum = sorted(int(s) for s in next(file).strip().split())

print(convolution_cyclopeptide_sequencing(spectrum, n, m))

99-147-163-128-160-186-186-128-163-186-186
163-128-160-186-186-128-147-115-147-186-186
160-128-163-147-99-186-87-115-147-128-186-186
103-57-128-163-147-99-186-87-115-147-128-186-186
[103, 57, 128, 163, 147, 99, 186, 87, 115, 147, 128, 186, 186]


In [51]:
from collections import Counter
from itertools import chain
from tqdm import tqdm

# Alphabet of masses; main() assigns it and extend() reads it.
ACIDS = []
# Best full-mass peptide found so far; cut_leaderboard() updates it and
# main() prints it.
LEADER_PEPTIDE = []


def get_best_with_ties(a, score_function, n):
    """Return the ``n`` highest-scoring items of ``a``, extended by ties.

    :param a: iterable of items
    :param score_function: callable mapping an item to a comparable score
    :param n: number of items to keep before ties are added
    :return: if ``a`` has at most ``n`` items, all of them in their original
        order; otherwise the top ``n`` in descending score order plus every
        following item whose score equals that of the n-th one. An empty list
        when ``n <= 0`` and ``a`` has more than ``n`` items.
    """
    a = list(a)
    if len(a) <= n:
        return a
    if n <= 0:
        # Without this guard, a[n - 1] would wrap around to the end of the list.
        return []

    # Score each item once; sort on the score only so items are never compared.
    scored = sorted(
        ((score_function(item), item) for item in a),
        key=lambda pair: pair[0],
        reverse=True,
    )
    score_threshold = scored[n - 1][0]
    while n < len(scored) and scored[n][0] == score_threshold:
        n += 1
    return [item for _, item in scored[:n]]


def most_frequent_convolutions(spectrum, m):
    """Return the ``m`` most frequent convolution masses of ``spectrum``, with ties.

    The convolution is the list of absolute pairwise differences of the
    spectrum entries; only values in 57..200 inclusive are considered.
    """
    size = len(spectrum)
    in_range = []
    for i in range(size):
        for j in range(i + 1, size):
            difference = abs(spectrum[j] - spectrum[i])
            if 57 <= difference <= 200:
                in_range.append(difference)

    frequency = Counter(in_range)
    return get_best_with_ties(set(in_range), lambda value: frequency[value], m)


def extend(peptids):
    """Lazily yield each peptide of ``peptids`` extended by each entry of the
    module-level ``ACIDS`` list.

    Iteration over ``peptids`` is wrapped in ``tqdm``, so consuming the
    generator displays a progress bar.
    """
    for base in tqdm(peptids):
        yield from (base + [acid] for acid in ACIDS)


def cyclospectrum(peptide):
    """Return the sorted cyclic spectrum of ``peptide``, computed from prefix sums.

    Contains 0, the mass of every contiguous sub-peptide (the whole peptide
    included once), and the mass of every sub-peptide that wraps around the
    end of the cycle.
    """
    size = len(peptide)

    prefix = [0]
    running = 0
    for amin in peptide:
        running += amin
        prefix.append(running)

    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            # Contiguous piece peptide[start:end].
            masses.append(prefix[end] - prefix[start])
            if start > 0 and end < size:
                # Wrapping piece peptide[end:] + peptide[:start].
                masses.append(prefix[size] - prefix[end] + prefix[start])
    masses.sort()
    return masses


def score(given_spectrum, peptide):
    """Return how many cyclospectrum masses of ``peptide`` are matched by
    ``given_spectrum``, counting multiplicities.

    ``given_spectrum`` maps a mass to its count; for each mass present in both
    spectra the smaller of the two counts is added.
    """
    total = 0
    for amin, cnt in Counter(cyclospectrum(peptide)).items():
        if amin in given_spectrum:
            total += min(cnt, given_spectrum[amin])
    return total


def cut_leaderboard(given_spectrum, peptids, n):
    """Filter ``peptids`` by mass, update the global leader and trim the board.

    Peptides heavier than ``max(given_spectrum)`` are dropped. A peptide whose
    mass equals that maximum replaces the module-level ``LEADER_PEPTIDE`` when
    it scores strictly higher. The remaining peptides are cut to the ``n``
    best by score, with ties.
    """
    global LEADER_PEPTIDE
    parent_mass = max(given_spectrum)
    kept = []
    for candidate in list(peptids):
        candidate_mass = sum(candidate)
        if candidate_mass <= parent_mass:
            kept.append(candidate)
            if candidate_mass == parent_mass:
                if score(given_spectrum, candidate) > score(given_spectrum, LEADER_PEPTIDE):
                    LEADER_PEPTIDE = candidate
    return get_best_with_ties(kept, lambda pep: score(given_spectrum, pep), n)


def main():
    """Read M, N and the spectrum from ``rosalind_ba4i.txt``, run the
    convolution leaderboard search and print the leader as ``-``-joined masses."""
    global ACIDS

    # The context manager closes the file even if parsing fails.
    with open('rosalind_ba4i.txt', 'r') as file:
        # NOTE(review): the alphabet size read from the file is reduced by
        # one before use — confirm this offset is intended.
        m = int(next(file)) - 1
        n = int(next(file))
        given_spectrum = [int(s) for s in next(file).strip().split()]

    ACIDS = most_frequent_convolutions(given_spectrum, m)
    given_spectrum_values = Counter(given_spectrum)

    # A real list, so the loop condition reflects whether any peptides remain
    # (a map object is always truthy).
    peptids = [[acid] for acid in ACIDS]
    while peptids:
        peptids = extend(peptids)
        peptids = cut_leaderboard(given_spectrum_values, peptids, n)
    print('-'.join(map(str, LEADER_PEPTIDE)))

# Call main() only when this code runs as the main program.
if __name__ == '__main__':
    main()


19it [00:00, 30900.26it/s]
100%|██████████| 361/361 [00:00<00:00, 80471.07it/s]
100%|██████████| 511/511 [00:00<00:00, 93658.86it/s]
100%|██████████| 476/476 [00:00<00:00, 36280.66it/s]
100%|██████████| 645/645 [00:00<00:00, 128147.70it/s]
100%|██████████| 376/376 [00:00<00:00, 122833.42it/s]
100%|██████████| 500/500 [00:00<00:00, 112968.76it/s]
100%|██████████| 340/340 [00:00<00:00, 72322.92it/s]
100%|██████████| 458/458 [00:00<00:00, 100244.81it/s]
100%|██████████| 357/357 [00:00<00:00, 129485.17it/s]
100%|██████████| 332/332 [00:00<00:00, 117441.93it/s]
100%|██████████| 369/369 [00:00<00:00, 99084.39it/s]
100%|██████████| 354/354 [00:00<00:00, 9039.17it/s]
100%|██████████| 329/329 [00:00<00:00, 93175.29it/s]
100%|██████████| 365/365 [00:00<00:00, 101412.36it/s]
100%|██████████| 208/208 [00:00<00:00, 79935.43it/s]

103-57-128-163-147-99-186-87-115-147-128-186-186



