In [81]:
import sys
import os

# Get current notebook directory
current_dir = os.getcwd()

# Add parent directory
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
import peptide
import math

In [82]:
amino_acid_masses = {
        "A": 71.03711,   # Alanine
        "R": 156.10111,  # Arginine
        "N": 114.04293,  # Asparagine
        "D": 115.02694,  # Aspartic acid
        "C": 103.00919,  # Cysteine
        "E": 129.04259,  # Glutamic acid
        "Q": 128.05858,  # Glutamine
        "G": 57.02146,   # Glycine
        "H": 137.05891,  # Histidine
        "I": 113.08406,  # Isoleucine
        "L": 113.08406,  # Leucine
        "K": 128.09496,  # Lysine
        "M": 131.04049,  # Methionine
        "F": 147.06841,  # Phenylalanine
        "P": 97.05276,   # Proline
        "S": 87.03203,   # Serine
        "T": 101.04768,  # Threonine
        "W": 186.07931,  # Tryptophan
        "Y": 163.06333,  # Tyrosine
        "V": 99.06841    # Valine
    }

In [83]:
def create_fake_pairs(the_peptide):
    the_pep = peptide.Pep(the_peptide)
    pair_result = []
    result = []
    for i in range(1, len(the_pep.AA_array)):
        frag1 = the_pep.AA_array[:i]
        frag2 = the_pep.AA_array[i:]
        mass1 = sum([i.get_mass() for i in frag1])
        mass2 = sum([i.get_mass() for i in frag2])
        mass2 += 18.01056  # Adding H2O mass to the second fragment
        print(f"Fragment 1: {frag1}, Mass: {mass1:.4f}")
        print(f"Fragment 2: {frag2}, Mass: {mass2:.4f}")
        pair_result.append([mass1, mass2]) 
        result.append(mass1)
        result.append(mass2)
    result.sort()
    
    return result, pair_result

In [84]:
seq = "[CGEYFQ+2H]2+"
sorted_array, pair_array = create_fake_pairs(seq)

Fragment 1: [C], Mass: 103.0092
Fragment 2: [G, E, Y, F, Q], Mass: 642.2649
Fragment 1: [C, G], Mass: 160.0307
Fragment 2: [E, Y, F, Q], Mass: 585.2435
Fragment 1: [C, G, E], Mass: 289.0732
Fragment 2: [Y, F, Q], Mass: 456.2009
Fragment 1: [C, G, E, Y], Mass: 452.1366
Fragment 2: [F, Q], Mass: 293.1375
Fragment 1: [C, G, E, Y, F], Mass: 599.2050
Fragment 2: [Q], Mass: 146.0691


In [85]:
sorted_array = [0.0] + sorted_array + [sum(pair_array[0])]

In [86]:
sorted_array

[0.0,
 103.00919,
 146.06914,
 160.03065,
 289.07324,
 293.13755,
 452.13657,
 456.20088000000004,
 585.24347,
 599.20498,
 642.26493,
 745.27412]

In [87]:
mid_point = len(sorted_array) // 2
lower_half = sorted_array[:mid_point]
upper_half = sorted_array[mid_point:]
len(lower_half), len(upper_half)

(6, 6)

In [88]:
def generate_psp_grid(spectrum):
    # Ensure the spectrum is sorted to match the index logic
    S = sorted(list(spectrum))
    n = len(S)
    
    # Create an n x n grid
    # We iterate rows (i) then columns (j)
    # Label logic: (Value of Column j, Value of Row i)
    grid = [[(S[j], S[i]) for j in range(n)] for i in range(n)]
    
    return grid


In [89]:
generate_psp_grid(lower_half)

[[(0.0, 0.0),
  (103.00919, 0.0),
  (146.06914, 0.0),
  (160.03065, 0.0),
  (289.07324, 0.0),
  (293.13755, 0.0)],
 [(0.0, 103.00919),
  (103.00919, 103.00919),
  (146.06914, 103.00919),
  (160.03065, 103.00919),
  (289.07324, 103.00919),
  (293.13755, 103.00919)],
 [(0.0, 146.06914),
  (103.00919, 146.06914),
  (146.06914, 146.06914),
  (160.03065, 146.06914),
  (289.07324, 146.06914),
  (293.13755, 146.06914)],
 [(0.0, 160.03065),
  (103.00919, 160.03065),
  (146.06914, 160.03065),
  (160.03065, 160.03065),
  (289.07324, 160.03065),
  (293.13755, 160.03065)],
 [(0.0, 289.07324),
  (103.00919, 289.07324),
  (146.06914, 289.07324),
  (160.03065, 289.07324),
  (289.07324, 289.07324),
  (293.13755, 289.07324)],
 [(0.0, 293.13755),
  (103.00919, 293.13755),
  (146.06914, 293.13755),
  (160.03065, 293.13755),
  (289.07324, 293.13755),
  (293.13755, 293.13755)]]

In [90]:
import pandas as pd

def generate_psp_dataframe(spectrum):
    S = sorted(list(spectrum))
    n = len(S)
    
    # Generate data with the logic (S[col], S[row])
    data = [[(S[j], S[i]) for j in range(n)] for i in range(n)]
    
    # Create DataFrame with headers for clarity
    df = pd.DataFrame(data, index=S, columns=S)
    
    return df

df = generate_psp_dataframe(lower_half)
df

Unnamed: 0,0.00000,103.00919,146.06914,160.03065,289.07324,293.13755
0.0,"(0.0, 0.0)","(103.00919, 0.0)","(146.06914, 0.0)","(160.03065, 0.0)","(289.07324, 0.0)","(293.13755, 0.0)"
103.00919,"(0.0, 103.00919)","(103.00919, 103.00919)","(146.06914, 103.00919)","(160.03065, 103.00919)","(289.07324, 103.00919)","(293.13755, 103.00919)"
146.06914,"(0.0, 146.06914)","(103.00919, 146.06914)","(146.06914, 146.06914)","(160.03065, 146.06914)","(289.07324, 146.06914)","(293.13755, 146.06914)"
160.03065,"(0.0, 160.03065)","(103.00919, 160.03065)","(146.06914, 160.03065)","(160.03065, 160.03065)","(289.07324, 160.03065)","(293.13755, 160.03065)"
289.07324,"(0.0, 289.07324)","(103.00919, 289.07324)","(146.06914, 289.07324)","(160.03065, 289.07324)","(289.07324, 289.07324)","(293.13755, 289.07324)"
293.13755,"(0.0, 293.13755)","(103.00919, 293.13755)","(146.06914, 293.13755)","(160.03065, 293.13755)","(289.07324, 293.13755)","(293.13755, 293.13755)"


In [114]:
def find_peptide_paths(spectrum, allowed_masses=None, tolerance=0.02):
    """
    Finds all valid paths from (0,0) in the PSP graph, handling floating point masses.
    
    Args:
        spectrum: A list or set of masses (e.g., {0.0, 57.02, 114.04...})
        allowed_masses: A list of valid jump sizes (e.g., amino acid masses). 
        tolerance: The allowable difference (delta) to consider a match valid.
    """
    
    # 1. Setup
    S = sorted(list(set(spectrum)))
    
    # Helper: Find values in S that are within 'tolerance' of 'target'
    def get_matches_in_spectrum(target_val):
        return [s for s in S if abs(s - target_val) <= tolerance]

    # Find start node (values in S close to 0)
    start_candidates = get_matches_in_spectrum(0.0)
    if not start_candidates:
        print("Warning: No mass close to 0.0 found in spectrum. Cannot start at (0,0).")
        return []
    
    # Start at the actual value found in spectrum (e.g., 0.0 or 0.0001)
    start_node = (start_candidates[0], start_candidates[0])

    # If no masses provided, use float versions of your example
    if allowed_masses is None:
        allowed_masses = [2.0, 3.0, 4.0, 6.0] 

    all_paths = []

    # 2. Recursive DFS Function
    def dfs(current_path):
        current_node = current_path[-1] # (x1, x2)
        x1, x2 = current_node
        current_max = max(x1, x2)
        
        found_extension = False
        
        # Try all possible mass jumps
        for m in allowed_masses:
            
            # --- Check "Down ↓" (Increase x1) ---
            target_x1 = x1 + m
            matches_x1 = get_matches_in_spectrum(target_x1)
            
            for s_next in matches_x1:
                next_node = (s_next, x2)
                
                # Growth Condition: max(new) must be > max(old)
                # With floats, strict inequality is sufficient as long as m > 0
                if max(next_node) <= current_max:
                    continue
                    
                found_extension = True
                dfs(current_path + [next_node])

            # --- Check "Right →" (Increase x2) ---
            target_x2 = x2 + m
            matches_x2 = get_matches_in_spectrum(target_x2)
            
            for s_next in matches_x2:
                next_node = (x1, s_next)
                
                if max(next_node) <= current_max:
                    continue

                found_extension = True
                dfs(current_path + [next_node])

        # If we can't extend further, this path is complete (or dead end)
        if not found_extension:
            all_paths.append(current_path)

    dfs([start_node])
    
    return all_paths

def format_path_string(path, with_aa= False):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
        else:
            direction = "Right →"
        if with_aa:
            if direction == "Down ↓":
                mass = round(next_n[0] - curr[0], 3)
            else:
                mass = round(next_n[1] - curr[1], 3)
            aa = None
            for key, value in amino_acid_masses.items():
                if abs(value - mass) <= 0.05:  # Allow small tolerance for matching
                    aa = key
                    break
            output += f" {direction}({aa}) {fmt_node(next_n)}"
        else:
        
            output += f" {direction} {fmt_node(next_n)}"
        
    return output

# --- Execution with Float Data ---

# 1. Define Spectrum (Simulated Float Data with slight noise)
# Ideally: {0, 2, 5, 6, 9}
# Noisy:   {0.0, 2.01, 5.00, 6.02, 9.01}
S_star = {0.0, 2.01, 5.00, 6.02, 9.01}

# 2. Define valid step sizes 
# We look for steps of size 2, 3, 4, 6 (allowing for tolerance)
example_masses = [2.0, 3.0, 4.0, 6.0]

# 3. Run Algorithm with Tolerance
# Tolerance 0.05 will match 2.01 to 2.0, and 9.01 to 5.00+4.0
paths = find_peptide_paths(S_star, allowed_masses=example_masses, tolerance=0.05)

# 4. Print Results
print(f"Spectrum: {sorted(list(S_star))}")
print(f"Allowed Steps: {example_masses}")
print(f"Tolerance: 0.05\n")
print(f"Found {len(paths)} valid paths:\n")

for p in paths:
    print(format_path_string(p))

Spectrum: [0.0, 2.01, 5.0, 6.02, 9.01]
Allowed Steps: [2.0, 3.0, 4.0, 6.0]
Tolerance: 0.05

Found 12 valid paths:

(0.0, 0.0) Down ↓ (2.01, 0.0) Down ↓ (5.0, 0.0) Down ↓ (9.01, 0.0)
(0.0, 0.0) Down ↓ (2.01, 0.0) Down ↓ (5.0, 0.0) Right → (5.0, 6.02) Right → (5.0, 9.01)
(0.0, 0.0) Down ↓ (2.01, 0.0) Down ↓ (5.0, 0.0) Right → (5.0, 6.02) Down ↓ (9.01, 6.02)
(0.0, 0.0) Down ↓ (2.01, 0.0) Down ↓ (6.02, 0.0) Down ↓ (9.01, 0.0)
(0.0, 0.0) Down ↓ (2.01, 0.0) Right → (2.01, 6.02) Right → (2.01, 9.01)
(0.0, 0.0) Right → (0.0, 2.01) Right → (0.0, 5.0) Right → (0.0, 9.01)
(0.0, 0.0) Right → (0.0, 2.01) Right → (0.0, 5.0) Down ↓ (6.02, 5.0) Down ↓ (9.01, 5.0)
(0.0, 0.0) Right → (0.0, 2.01) Right → (0.0, 5.0) Down ↓ (6.02, 5.0) Right → (6.02, 9.01)
(0.0, 0.0) Right → (0.0, 2.01) Right → (0.0, 6.02) Right → (0.0, 9.01)
(0.0, 0.0) Right → (0.0, 2.01) Down ↓ (6.02, 2.01) Down ↓ (9.01, 2.01)
(0.0, 0.0) Down ↓ (6.02, 0.0) Down ↓ (9.01, 0.0)
(0.0, 0.0) Right → (0.0, 6.02) Right → (0.0, 9.01)


In [115]:
paths

[[(0.0, 0.0), (2.01, 0.0), (5.0, 0.0), (9.01, 0.0)],
 [(0.0, 0.0), (2.01, 0.0), (5.0, 0.0), (5.0, 6.02), (5.0, 9.01)],
 [(0.0, 0.0), (2.01, 0.0), (5.0, 0.0), (5.0, 6.02), (9.01, 6.02)],
 [(0.0, 0.0), (2.01, 0.0), (6.02, 0.0), (9.01, 0.0)],
 [(0.0, 0.0), (2.01, 0.0), (2.01, 6.02), (2.01, 9.01)],
 [(0.0, 0.0), (0.0, 2.01), (0.0, 5.0), (0.0, 9.01)],
 [(0.0, 0.0), (0.0, 2.01), (0.0, 5.0), (6.02, 5.0), (9.01, 5.0)],
 [(0.0, 0.0), (0.0, 2.01), (0.0, 5.0), (6.02, 5.0), (6.02, 9.01)],
 [(0.0, 0.0), (0.0, 2.01), (0.0, 6.02), (0.0, 9.01)],
 [(0.0, 0.0), (0.0, 2.01), (6.02, 2.01), (9.01, 2.01)],
 [(0.0, 0.0), (6.02, 0.0), (9.01, 0.0)],
 [(0.0, 0.0), (0.0, 6.02), (0.0, 9.01)]]

In [116]:
path = find_peptide_paths(lower_half, amino_acid_masses.values())

In [117]:
for p in path:
    print(format_path_string(p, with_aa=True))

(0.0, 0.0) Down ↓(C) (103.009, 0.0) Down ↓(G) (160.031, 0.0) Down ↓(E) (289.073, 0.0)
(0.0, 0.0) Down ↓(C) (103.009, 0.0) Down ↓(W) (289.073, 0.0)
(0.0, 0.0) Right →(C) (0.0, 103.009) Right →(G) (0.0, 160.031) Right →(E) (0.0, 289.073)
(0.0, 0.0) Right →(C) (0.0, 103.009) Right →(W) (0.0, 289.073)


In [142]:
def find_peptide_paths(spectrum, allowed_masses=None, tolerance=0.02, start_point=(0.0, 18.01056)):
    """
    Finds all valid paths in the PSP graph, usually starting at (0, Water_Mass).
    
    Args:
        spectrum: A list or set of masses (e.g., {0.0, 18.01, 75.03...})
        allowed_masses: A list of valid jump sizes (e.g., amino acid masses). 
        tolerance: The allowable difference (delta) to consider a match valid.
        start_point: Tuple (x1, x2) to start search. Default is (0, Mass_H2O).
    """
    
    # 1. Setup
    S = sorted(list(set(spectrum)))
    
    # Helper: Find values in S that are within 'tolerance' of 'target'
    def get_matches_in_spectrum(target_val):
        return [s for s in S if abs(s - target_val) <= tolerance]

    # --- Start Node Logic ---
    target_x1, target_x2 = start_point
    
    matches_x1 = get_matches_in_spectrum(target_x1)
    matches_x2 = get_matches_in_spectrum(target_x2)
    
    if not matches_x1:
        print(f"Warning: Start value x1={target_x1} not found in spectrum (within tol={tolerance}).")
        return []
    if not matches_x2:
        print(f"Warning: Start value x2={target_x2} not found in spectrum (within tol={tolerance}).")
        return []
        
    # Generate all valid start combinations from the fuzzy matches
    start_nodes = [(m1, m2) for m1 in matches_x1 for m2 in matches_x2]

    # If no masses provided, use float versions of your example
    if allowed_masses is None:
        allowed_masses = [57.021, 71.037, 87.032, 97.053] # Gly, Ala, Ser, Pro (examples)

    all_paths = []

    # 2. Recursive DFS Function
    def dfs(current_path):
        current_node = current_path[-1] # (x1, x2)
        x1, x2 = current_node
        current_max = max(x1, x2)
        
        found_extension = False
        
        # Try all possible mass jumps
        for m in allowed_masses:
            
            # --- Check "Down ↓" (Increase x1) ---
            target_x1_next = x1 + m
            matches_x1_next = get_matches_in_spectrum(target_x1_next)
            
            for s_next in matches_x1_next:
                next_node = (s_next, x2)
                
                # Growth Condition: max(new) must be > max(old)
                if max(next_node) <= current_max:
                    continue
                    
                found_extension = True
                dfs(current_path + [next_node])

            # --- Check "Right →" (Increase x2) ---
            target_x2_next = x2 + m
            matches_x2_next = get_matches_in_spectrum(target_x2_next)
            
            for s_next in matches_x2_next:
                next_node = (x1, s_next)
                
                if max(next_node) <= current_max:
                    continue

                found_extension = True
                dfs(current_path + [next_node])

        # If we can't extend further, this path is complete (or dead end)
        if not found_extension:
            all_paths.append(current_path)

    # Launch search from all valid start nodes
    for start_node in start_nodes:
        dfs([start_node])
    
    return all_paths

def format_path_string(path, with_aa= False):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
        else:
            direction = "Right →"
        if with_aa:
            if direction == "Down ↓":
                mass = round(next_n[0] - curr[0], 3)
            else:
                mass = round(next_n[1] - curr[1], 3)
            aa = None
            for key, value in amino_acid_masses.items():
                if abs(value - mass) <= 0.05:  # Allow small tolerance for matching
                    aa = key
                    break
            output += f" {direction}({aa}) {fmt_node(next_n)}"
        else:
        
            output += f" {direction} {fmt_node(next_n)}"
        
    return output


def path_to_seq(path, seq_mass):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    forward = []
    backward = []
    middle = None
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
    
        else:
            direction = "Right →"

        if direction == "Down ↓":
            mass = round(next_n[0] - curr[0], 3)
        else:
            mass = round(next_n[1] - curr[1], 3)
        aa = None
        for key, value in amino_acid_masses.items():
            if abs(value - mass) <= 0.05:  # Allow small tolerance for matching
                aa = key
                break
        output += f" {direction}({aa}) {fmt_node(next_n)}"
        if direction == "Down ↓":
            forward.append(aa)
        else:
            backward.append(aa)
    backward.reverse()
    
    the_middle_diff = seq_mass - (path[-1][0] + path[-1][1])
    for key, value in amino_acid_masses.items():
        if abs(value - the_middle_diff) <= 0.05:
            middle = key
            break

    full_seq = "".join(forward)
    if middle:
        full_seq += middle
    else:
        full_seq += "?"
    full_seq += "".join(backward)
    return full_seq

# --- Execution with Water Mass Start ---

# 1. Define Spectrum (Simulated)
# We need 0.0, 18.01 (Water), and some peaks that result from adding AA masses to them.
# Let's simulate a Glycine (57.02) step.
# 0.0 + 57.02 = 57.02
# 18.01 + 57.02 = 75.03
S_star = {0.0, 18.01056, 57.02146, 75.03202} 

# 2. Define valid step sizes (e.g., Glycine only for this test)
example_masses = [57.02146]


In [125]:
lower_half_modified = lower_half + [18.01056]
lower_half_modified.sort()

In [127]:
# 3. Run Algorithm starting at (0, 18.01)
paths = find_peptide_paths(
    lower_half_modified, 
    allowed_masses=amino_acid_masses.values(), 
    tolerance=0.02,
    start_point=(0.0, 18.01056)
)

# 4. Print Results
print(f"Spectrum: {[round(x,3) for x in sorted(list(S_star))]}")
print(f"Allowed Steps: {example_masses}")
print(f"Start Point: (0, 18.01056)\n")

if not paths:
    print("No paths found. (Did you ensure the start point and steps exist in the spectrum?)")
else:
    print(f"Found {len(paths)} valid paths:\n")
    for p in paths:
        print(format_path_string(p, with_aa=True))

Spectrum: [0.0, 18.011, 57.021, 75.032]
Allowed Steps: [57.02146]
Start Point: (0, 18.01056)

Found 7 valid paths:

(0.0, 18.011) Down ↓(C) (103.009, 18.011) Right →(Q) (103.009, 146.069) Down ↓(G) (160.031, 146.069) Down ↓(E) (289.073, 146.069) Right →(F) (289.073, 293.138)
(0.0, 18.011) Down ↓(C) (103.009, 18.011) Right →(Q) (103.009, 146.069) Down ↓(G) (160.031, 146.069) Right →(F) (160.031, 293.138)
(0.0, 18.011) Down ↓(C) (103.009, 18.011) Right →(Q) (103.009, 146.069) Right →(F) (103.009, 293.138)
(0.0, 18.011) Down ↓(C) (103.009, 18.011) Right →(Q) (103.009, 146.069) Down ↓(W) (289.073, 146.069) Right →(F) (289.073, 293.138)
(0.0, 18.011) Down ↓(C) (103.009, 18.011) Down ↓(G) (160.031, 18.011) Down ↓(E) (289.073, 18.011)
(0.0, 18.011) Down ↓(C) (103.009, 18.011) Down ↓(W) (289.073, 18.011)
(0.0, 18.011) Right →(Q) (0.0, 146.069) Right →(F) (0.0, 293.138)


In [134]:
def get_pep_mass(seq):
    mass= 0
    for i in seq:
        mass += amino_acid_masses.get(i, 0)
    mass += 18.01056  # Adding H2O mass
    return mass
total_mass = get_pep_mass("CGEYFQ")

In [136]:
total_mass - 289.073 - 293.138

163.06312000000008

In [143]:
for p in paths:
    print(path_to_seq(p, total_mass))

CGEYFQ
CG?FQ
C?FQ
CWYFQ
CGE?
CW?
?FQ


In [1]:
import anti_symmetric

## real data

In [1]:
amino_acid_masses = {
        "A": 71.03711,   # Alanine
        "R": 156.10111,  # Arginine
        "N": 114.04293,  # Asparagine
        "D": 115.02694,  # Aspartic acid
        "C": 103.00919,  # Cysteine
        "E": 129.04259,  # Glutamic acid
        "Q": 128.05858,  # Glutamine
        "G": 57.02146,   # Glycine
        "H": 137.05891,  # Histidine
        "I": 113.08406,  # Isoleucine
        "L": 113.08406,  # Leucine
        "K": 128.09496,  # Lysine
        "M": 131.04049,  # Methionine
        "F": 147.06841,  # Phenylalanine
        "P": 97.05276,   # Proline
        "S": 87.03203,   # Serine
        "T": 101.04768,  # Threonine
        "W": 186.07931,  # Tryptophan
        "Y": 163.06333,  # Tyrosine
        "V": 99.06841    # Valine
    }


amino_acid_masses_merge = {
        "A": 71.03711,   # Alanine
        "R": 156.10111,  # Arginine
        "N": 114.04293,  # Asparagine
        "D": 115.02694,  # Aspartic acid
        "C": 103.00919,  # Cysteine
        "E": 129.04259,  # Glutamic acid
        "Q": 128.05858,  # Glutamine
        "G": 57.02146,   # Glycine
        "H": 137.05891,  # Histidine
        "113(I/L)": 113.08406,  # Isoleucine
        #"L": 113.08406,  # Leucine
        "K": 128.09496,  # Lysine
        "M": 131.04049,  # Methionine
        "F": 147.06841,  # Phenylalanine
        "P": 97.05276,   # Proline
        "S": 87.03203,   # Serine
        "T": 101.04768,  # Threonine
        "W": 186.07931,  # Tryptophan
        "Y": 163.06333,  # Tyrosine
        "V": 99.06841    # Valine
    }

def create_fake_pairs(the_peptide):
    the_pep = peptide.Pep(the_peptide)
    pair_result = []
    result = []
    for i in range(1, len(the_pep.AA_array)):
        frag1 = the_pep.AA_array[:i]
        frag2 = the_pep.AA_array[i:]
        mass1 = sum([i.get_mass() for i in frag1])
        mass2 = sum([i.get_mass() for i in frag2])
        mass2 += 18.01056  # Adding H2O mass to the second fragment
        print(f"Fragment 1: {frag1}, Mass: {mass1:.4f}")
        print(f"Fragment 2: {frag2}, Mass: {mass2:.4f}")
        pair_result.append([mass1, mass2]) 
        result.append(mass1)
        result.append(mass2)
    result.sort()
    
    return result, pair_result

def get_pep_mass(seq):
    mass= 0
    for i in seq:
        mass += amino_acid_masses.get(i, 0)
    mass += 18.01056  # Adding H2O mass
    return mass


def find_peptide_paths(spectrum, allowed_masses=None, tolerance=0.02, start_point=(0.0, 18.01056)):
    """
    Finds all valid paths in the PSP graph, usually starting at (0, Water_Mass).
    
    Args:
        spectrum: A list or set of masses (e.g., {0.0, 18.01, 75.03...})
        allowed_masses: A list of valid jump sizes (e.g., amino acid masses). 
        tolerance: The allowable difference (delta) to consider a match valid.
        start_point: Tuple (x1, x2) to start search. Default is (0, Mass_H2O).
    """
    
    # 1. Setup
    S = sorted(list(set(spectrum)))
    
    # Helper: Find values in S that are within 'tolerance' of 'target'
    def get_matches_in_spectrum(target_val):
        return [s for s in S if abs(s - target_val) <= tolerance]

    # --- Start Node Logic ---
    target_x1, target_x2 = start_point
    
    matches_x1 = get_matches_in_spectrum(target_x1)
    matches_x2 = get_matches_in_spectrum(target_x2)
    
    if not matches_x1:
        print(f"Warning: Start value x1={target_x1} not found in spectrum (within tol={tolerance}).")
        return []
    if not matches_x2:
        print(f"Warning: Start value x2={target_x2} not found in spectrum (within tol={tolerance}).")
        return []
        
    # Generate all valid start combinations from the fuzzy matches
    start_nodes = [(m1, m2) for m1 in matches_x1 for m2 in matches_x2]

    # If no masses provided, use float versions of your example
    if allowed_masses is None:
        allowed_masses = [57.021, 71.037, 87.032, 97.053] # Gly, Ala, Ser, Pro (examples)

    all_paths = []

    # 2. Recursive DFS Function
    def dfs(current_path):
        current_node = current_path[-1] # (x1, x2)
        x1, x2 = current_node
        current_max = max(x1, x2)
        
        found_extension = False
        
        # Try all possible mass jumps
        for m in allowed_masses:
            
            # --- Check "Down ↓" (Increase x1) ---
            target_x1_next = x1 + m
            matches_x1_next = get_matches_in_spectrum(target_x1_next)
            
            for s_next in matches_x1_next:
                next_node = (s_next, x2)
                
                # Growth Condition: max(new) must be > max(old)
                if max(next_node) <= current_max:
                    continue
                    
                found_extension = True
                dfs(current_path + [next_node])

            # --- Check "Right →" (Increase x2) ---
            target_x2_next = x2 + m
            matches_x2_next = get_matches_in_spectrum(target_x2_next)
            
            for s_next in matches_x2_next:
                next_node = (x1, s_next)
                
                if max(next_node) <= current_max:
                    continue

                found_extension = True
                dfs(current_path + [next_node])

        # If we can't extend further, this path is complete (or dead end)
        if not found_extension:
            all_paths.append(current_path)

    # Launch search from all valid start nodes
    for start_node in start_nodes:
        dfs([start_node])
    
    return all_paths

def format_path_string(path, with_aa= False):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
        else:
            direction = "Right →"
        if with_aa:
            if direction == "Down ↓":
                mass = round(next_n[0] - curr[0], 3)
            else:
                mass = round(next_n[1] - curr[1], 3)
            aa = None
            for key, value in amino_acid_masses.items():
                if abs(value - mass) <= 0.01:  # Allow small tolerance for matching
                    aa = key
                    break
            output += f" {direction}({aa}) {fmt_node(next_n)}"
        else:
        
            output += f" {direction} {fmt_node(next_n)}"
        
    return output


def path_to_seq(path, seq_mass):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    forward = []
    backward = []
    middle = None
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
    
        else:
            direction = "Right →"

        if direction == "Down ↓":
            mass = round(next_n[0] - curr[0], 3)
        else:
            mass = round(next_n[1] - curr[1], 3)
        aa = None
        for key, value in amino_acid_masses.items():
            if abs(value - mass) <= 0.001:  # Allow small tolerance for matching
                aa = key
                break
        output += f" {direction}({aa}) {fmt_node(next_n)}"
        if direction == "Down ↓":
            forward.append(aa)
        else:
            backward.append(aa)
    backward.reverse()
    
    the_middle_diff = seq_mass - (path[-1][0] + path[-1][1])
    
    for key, value in amino_acid_masses.items():
        if abs(value - the_middle_diff) <= 0.05:
            middle = key
            break

    full_seq = "".join(forward)
    if middle:
        full_seq += middle
    else:
        full_seq += "?"
    full_seq += "".join(backward)
    return full_seq

In [2]:
import os
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
vis_dir = os.path.join(parent_dir, "vis")
connected_graphs_dir = os.path.join(parent_dir, "vis_connect")

sys.path.append(vis_dir)
sys.path.append(connected_graphs_dir)

import data_parse
import util
import peptide
import pandas as pd
import numpy as np
import connected_graph

In [3]:
data = 'ME4_2+'
csv_data = f"{data}.csv"
file_path = f"../data/Top_Correlations_At_Full_Num_Scans_PCov/annotated/{csv_data}"
file_path = os.path.abspath(file_path) 

In [4]:
sequence = util.name_ouput(csv_data)
pep = peptide.Pep(sequence)
the_length = len(pep.AA_array)
csv_data = file_path
df = pd.read_csv(csv_data)
df = df[df['Index'].notna()]
results = data_parse.process_ion_dataframe(df.head(50), pep)
results['classification'] = results.apply(data_parse.data_classify, args=(pep,), axis=1)
the_list = []
the_y_list = []

results['loss1'] = results['loss1'].replace({None: np.nan})
results['loss2'] = results['loss2'].replace({None: np.nan})

  df_current.loc[df_current['type1'] == 'y', ['y_ion', 'y_mz']] = df_current.loc[df_current['type1'] == 'y', ['ion1', 'mass1']].values
  df_current.loc[df_current['type1'] == 'b', ['b_ion', 'b_mz']] = df_current.loc[df_current['type1'] == 'b', ['ion1', 'mass1']].values


In [5]:
df = results
df['ranking'] = df['Index']

In [6]:
df.head(5)

Unnamed: 0,Index,A_raw,ion1,loss1,loss_sign1,charge1,mass1,B_raw,ion2,loss2,...,chosen_sum_from,chosen_sum,type1,type2,y_ion,y_mz,b_ion,b_mz,classification,ranking
0,1.0,b4(1+),b4,,,1+,508.15,y9(1+),y9,,...,m1+m2,1525.89,b,y,y9,1017.74,b4,508.15,usable,1.0
1,2.0,y6(1+),y6,,,1+,685.48,b7(1+),b7,,...,m1+m2,1525.9,y,b,y6,685.48,b7,840.42,usable,2.0
2,3.0,y5(1+),y5,,,1+,571.42,b8-(NH3)(1+),b8,(NH3),...,m1+m2,1508.79,y,b,y5,571.42,b8,937.37,usable,3.0
3,4.0,b6(1+),b6,,,1+,712.4,y7(1+),y7,,...,m1+m2,1525.92,b,y,y7,813.52,b6,712.4,usable,4.0
4,5.0,y5(1+),y5,,,1+,571.42,b8(1+),b8,,...,m1+m2,1525.82,y,b,y5,571.42,b8,954.4,usable,5.0


In [7]:
LETTER_ORDER = {ch: i for i, ch in enumerate("abcdefghijklmnopqrstuvwxyz")}
rows = ['Parent','(NH3)','(H2O)', '(NH3)-(H2O)','(H2O)-(NH3)', 'a', '2(H2O)', '2(NH3)', '(H3PO4)']
conserve_line_mass_dict = {'Parent': pep.pep_mass, 'a': pep.pep_mass - 28.0106}

def classify_conserve_line(row):
    the_mass = row['chosen_sum']
    for i in conserve_line_mass_dict:
        if the_mass < conserve_line_mass_dict[i] + 1 and the_mass > conserve_line_mass_dict[i] - 1:
            return i
    else:
        return None

df['conserve_line'] = df.apply(classify_conserve_line, axis = 1)

In [8]:
data = 'ME4_2+'
my_peaks, sequence, pep = connected_graph.build_mass_list(data)

  df_current.loc[df_current['type1'] == 'y', ['y_ion', 'y_mz']] = df_current.loc[df_current['type1'] == 'y', ['ion1', 'mass1']].values
  df_current.loc[df_current['type1'] == 'b', ['b_ion', 'b_mz']] = df_current.loc[df_current['type1'] == 'b', ['ion1', 'mass1']].values


In [9]:
my_peaks

[273.18005400000004,
 344.133164,
 386.264114,
 499.348174,
 507.19649400000003,
 564.217954,
 570.385284,
 684.4282139999999,
 711.2863639999999,
 812.4867939999999,
 839.3449439999999,
 953.3878739999999,
 959.5552039999999,
 1016.5766639999999,
 1024.424984,
 1137.509044,
 1179.639994,
 1250.5931039999998]

In [10]:
sequence

'ME4_2+: [LGE(nitro)YGFQNAILVR+2H]2+'

In [11]:
sorted_array = [0.0] + my_peaks + [pep.seq_mass]

In [12]:
mid_point = len(sorted_array) // 2
lower_half = sorted_array[:mid_point]
upper_half = sorted_array[mid_point:]
lower_half_modified = lower_half + [18.01056]
lower_half_modified.sort()

In [13]:
lower_half_modified

[0.0,
 18.01056,
 273.18005400000004,
 344.133164,
 386.264114,
 499.348174,
 507.19649400000003,
 564.217954,
 570.385284,
 684.4282139999999,
 711.2863639999999]

In [14]:
AA_MASSES = connected_graph.AA_MASSES
DOUBLE_AA_MASSES = connected_graph.DOUBLE_AA_MASSES
TRIPLE_AA_MASSES = connected_graph.TRIPLE_AA_MASSES
QUADRA_AA_MASSES = connected_graph.QUADRA_AA_MASSES

In [15]:
allowed_mass_list = list(AA_MASSES.values()) + list(DOUBLE_AA_MASSES.values()) + list(TRIPLE_AA_MASSES.values()) #+ list(QUADRA_AA_MASSES.values())
merge_close_values = connected_graph.merge_close_values
allowed_mass_list = merge_close_values(allowed_mass_list, 0.01)



In [16]:
paths = find_peptide_paths(
    lower_half_modified, 
    allowed_masses=allowed_mass_list, 
    tolerance=0.01,
    start_point=(0.0, 18.01056)
)

In [17]:
def format_path_string_no_aa(path, with_aa= False):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
        else:
            direction = "Right →"
        if with_aa:
            if direction == "Down ↓":
                mass = round(next_n[0] - curr[0], 3)
            else:
                mass = round(next_n[1] - curr[1], 3)
            aa = None
            for key, value in amino_acid_masses.items():
                if abs(value - mass) <= 0.05:  # Allow small tolerance for matching
                    aa = key
                    break
            output += f" {direction}({aa}) {fmt_node(next_n)}"
        else:
        
            output += f" {direction} {fmt_node(next_n)}"
        
    return output

In [18]:
for p in paths:
    print(format_path_string_no_aa(p, with_aa=True))

(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(I) (0.0, 499.348) Right →(A) (0.0, 570.385) Right →(N) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(I) (0.0, 499.348) Right →(A) (0.0, 570.385) Right →(N) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(I) (0.0, 499.348) Right →(None) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(I) (0.0, 499.348) Right →(None) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(I) (0.0, 499.348) Right →(None) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(None) (0.0, 570.385) Right →(N) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(None) (0.0, 570.385) Right →(N) (0.0, 684.428)
(0.0, 18.011) Right →(None) (0.0, 273.18) Right →(I) (0.0, 386.264) Right →(None) (0.0, 684.4

In [19]:
the_max = max([len(p) for p in paths])
the_max_length_num = sum([1 for p in paths if len(p) == the_max])
the_max_length_paths = [p for p in paths if len(p) == the_max]

the_max_length_pep = set([format_path_string_no_aa(p) for p in paths if len(p) == the_max])


print(len(paths), "paths found.", "Max length:", the_max, 'There are', len(the_max_length_pep), "paths of max length.")
print("Max length paths:")

for p in the_max_length_pep:
    print(p)

1156 paths found. Max length: 10 There are 1 paths of max length.
Max length paths:
(0.0, 18.011) Right → (0.0, 273.18) Down ↓ (344.133, 273.18) Right → (344.133, 386.264) Right → (344.133, 499.348) Down ↓ (507.196, 499.348) Down ↓ (564.218, 499.348) Right → (564.218, 570.385) Right → (564.218, 684.428) Down ↓ (711.286, 684.428)


In [44]:
# Combine all mass maps into one dictionary
MASTER_MASS_MAP = {
    **AA_MASSES,
    **DOUBLE_AA_MASSES,
    **TRIPLE_AA_MASSES,
}

def find_possible_labels(target_mass: float, tol: float = 0.01):
    """
    Given a mass, return all keys whose values fall within ±tol.

    Returns:
        List of (label, mass) pairs.
    """
    matches = []
    for label, mass in MASTER_MASS_MAP.items():
        if abs(mass - target_mass) <= tol:
            matches.append((label, mass))
    if len(matches) == 0:
        #return ['?']
        return (target_mass)
    else:
        return tuple(matches)

In [45]:
def path_to_seq_complex(path, seq_mass, thresh = 0.01):
    """
    Helper to turn a list of nodes [(0,0), (0,2)...] into the arrow string format
    Rounds numbers for cleaner display.
    """
    if not path: return ""
    forward = []
    backward = []
    middle = None
    def fmt_node(n):
        return f"({round(n[0], 3)}, {round(n[1], 3)})"
    
    output = fmt_node(path[0])
    
    for i in range(len(path) - 1):
        curr = path[i]
        next_n = path[i+1]
        
        # Determine direction
        # We use a small epsilon for direction check due to float precision,
        # though standard inequality usually works fine.
        if next_n[0] > curr[0]:
            direction = "Down ↓"
    
        else:
            direction = "Right →"

        if direction == "Down ↓":
            mass = round(next_n[0] - curr[0], 3)
        else:
            mass = round(next_n[1] - curr[1], 3)
        
        possible_labels = find_possible_labels(mass, tol=thresh)
        
        output += f" {direction}({possible_labels}) {fmt_node(next_n)}"
        if direction == "Down ↓":
            forward.append(possible_labels)
        else:
            backward.append(possible_labels)
    backward.reverse()
    
    the_middle_diff = seq_mass - (path[-1][0] + path[-1][1])
    
    
    possible_middle_labels = find_possible_labels(the_middle_diff, tol=thresh)
    
    

    #full_seq = "".join(forward)
    #full_seq += f"({possible_middle_labels})"
    #full_seq += "".join(backward)
    
    full_seq = []
    full_seq.extend(forward)
    full_seq.append(possible_middle_labels)
    full_seq.extend(backward)
    
    return tuple(full_seq)

In [48]:
the_max_length_pep = set([path_to_seq_complex(p, pep.seq_mass + 18.01056) for p in paths if len(p) == the_max])


In [50]:
the_max_length_pep


{((('(I/L)+E(nitro)+G', 344.133201),
   ('A+E(nitro)+V', 344.13320100000004),
   ('D+Q+T', 344.13320000000004),
   ('E+N+T', 344.13319900000005),
   ('E+Q+S', 344.133199)),
  (('Y', 163.063329),),
  (('G', 57.021464),),
  (('F', 147.068414),),
  (('Q', 128.058578), ('G+A', 128.058578)),
  (('N', 114.042927), ('G+G', 114.042928)),
  (('A', 71.037114),),
  (('(I/L)', 113.084064),),
  (('(I/L)', 113.084064),),
  (('V+R', 255.16952500000002),))}