In [230]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import re

# Working directory of the running kernel (assumed to be the notebook's own
# directory — TODO confirm the kernel is launched from there).
current_dir = os.getcwd()

# Put the parent directory on the import path so the imports below can resolve
# modules that live one level up.
# NOTE(review): presumably `peptide` and `interpreter_modify` are project-local
# modules located there — confirm. These imports must stay after the
# sys.path.append call.
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
import peptide
import math
import interpreter_modify

In [198]:
# NOTE(review): scratch arithmetic — presumably an observed m/z multiplied by a
# charge of 6, to compare with a pep_mass value elsewhere in the notebook; confirm.
712.52139 * 6

4275.12834

In [199]:
# Load the whitespace-delimited covariance table. The first line of the file is
# skipped and no header is read; column names are assigned by hand below.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    "/Users/kevinmbp/Desktop/2D_spec_dict/data/long_peptide/CovarianceData.NeuropeptideY_Z6_NCE25_300_ions",
    sep=r"\s+",          # any whitespace
    skiprows=1,          
    header=None,
    engine="python"
)

# Assigning six names requires the file to have exactly six columns.
df.columns = ['m/z A', 'm/z B', 'Covariance', 'Partial Cov.', 'Score', 'Ranking']  # rename as you like
# Keep only rows with a strictly positive Score.
df = df[df['Score'] > 0]
df.head()

Unnamed: 0,m/z A,m/z B,Covariance,Partial Cov.,Score,Ranking
0,155.082132,155.082132,56872.291,56726.735,2068.6444,-1
3,155.082132,159.077653,167.26215,102.94098,18.310347,314158
5,155.082132,173.092805,485.99469,454.8254,102.22621,38166
6,155.082132,185.092879,200.25831,39.652668,4.582835,377494
7,155.082132,185.093548,133.94481,58.94333,8.576524,359300


In [175]:
# NOTE(review): the saved output of this cell lists columns mz_x / mz_y / intensity,
# which do not match the column names assigned when `df` is loaded above — it
# looks like stale output from an earlier kernel state; re-run to refresh.
df.head()

Unnamed: 0,mz_x,mz_y,intensity
14513,186.088018,630.395586,1
15223,186.088018,798.239231,2
44610,280.182538,862.268876,3
44609,280.182538,862.018239,4
81094,379.730691,812.24398,5


In [133]:
# Rank-ordered, de-duplicated list of m/z pairs.
# Sort by Ranking (ascending), then drop rows whose Ranking is -1.
# NOTE(review): in the sample output Ranking == -1 appears on a self-pair row
# (m/z A == m/z B) — presumably a "not ranked" marker; confirm.
data = df.sort_values('Ranking', ascending=True)
data = data[data['Ranking'] != -1]

# (A, B) and (B, A) describe the same pair, so order each row's two m/z values
# before looking for repeats; the first (lowest-Ranking) occurrence is kept.
# The mask is computed on a side array rather than through a temporary
# "pair_key" column: that avoids assigning into a filtered slice
# (SettingWithCopyWarning) and a slow row-wise apply.
ordered_pairs = np.sort(data[['m/z A', 'm/z B']].to_numpy(), axis=1)
is_repeat = pd.DataFrame(ordered_pairs, index=data.index).duplicated(keep='first')
data = data[~is_repeat.to_numpy()]

In [134]:
#data = data
# Peptide sequence and charge state for this analysis.
# NOTE(review): the later pipeline cell hard-codes a "+6H]6+" precursor and passes
# 6 to partition_dataframe_by_charge instead of using `charge` (4 here) — confirm
# which value is intended.
pep_seq = 'KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK'
charge = 4

In [135]:
# Keep only the pair coordinates and their rank, then preview the first rows.
data = data[['m/z A', 'm/z B', 'Ranking']]
data.head()

Unnamed: 0,m/z A,m/z B,Ranking
14513,186.088018,630.395586,1
15223,186.088018,798.239231,2
44610,280.182538,862.268876,3
44609,280.182538,862.018239,4
81094,379.730691,812.24398,5


In [136]:
def partition_dataframe_by_charge(df, charge):
    """
    Expand every row into all ways of splitting ``charge`` between the two m/z columns.

    For each split (n, m) with n + m == charge and n, m >= 1, three columns are
    appended to a copy of ``df``:
      - "comp_<n>_m/z A": n * 'm/z A'
      - "comp_<m>_m/z B": m * 'm/z B'
      - "<n>*m/z A + <m>*m/z B": the sum of the two components

    Args:
        df (pd.DataFrame): Frame with 'm/z A' and 'm/z B' columns. Not modified.
        charge (int): Total charge to split; values below 2 produce no splits.

    Returns:
        tuple: (expanded copy of ``df``, list of the sum-column names in creation order).
    """
    col_a, col_b = 'm/z A', 'm/z B'
    expanded = df.copy()
    sum_columns = []

    for z_a in range(1, charge):
        z_b = charge - z_a

        comp_a_name = f"comp_{z_a}_{col_a}"
        comp_b_name = f"comp_{z_b}_{col_b}"
        total_name = f"{z_a}*{col_a} + {z_b}*{col_b}"

        # Components are stored as well so they can be looked up by name later.
        expanded[comp_a_name] = df[col_a] * z_a
        expanded[comp_b_name] = df[col_b] * z_b
        expanded[total_name] = expanded[comp_a_name] + expanded[comp_b_name]

        sum_columns.append(total_name)

    return expanded, sum_columns

def select_best_partition(df, original_cols, target_number, threshold, partitioned_names):
    """
    Pick, for every row, the charge partition whose sum is closest to
    ``target_number`` and keep the row only if that deviation is within ``threshold``.

    Args:
        df (pd.DataFrame): Frame holding, for each partition, the sum column named
            in ``partitioned_names`` ("<n>*m/z A + <m>*m/z B") together with the
            component columns "comp_<n>_m/z A" and "comp_<m>_m/z B".
        original_cols (list): Columns of ``df`` carried over into the result.
        target_number (float): Value the partition sums are compared against.
        threshold (float): Largest accepted absolute deviation (inclusive).
        partitioned_names (list): Names of the partition sum columns. The two
            charges are read from the digits preceding each '*' in the name.

    Returns:
        pd.DataFrame: One row per accepted input row (index reset to 0..k-1) with
        ``original_cols`` followed by: selected_total, component_x1, component_x2,
        deviation, charge_A, charge_B, adj_mass_A, adj_mass_B. The adjusted mass
        is ``component - (charge - 1) * MASS_H``. Rows whose sums are all NaN are
        dropped. An empty ``partitioned_names`` gives an empty frame that still
        has the full column set.
    """
    # Constant for Hydrogen mass.
    # NOTE(review): 1.00784 is neither the proton mass (~1.00728) nor the
    # monoisotopic 1H mass (~1.00783) — confirm it matches the convention used
    # by the theoretical masses this is compared against.
    MASS_H = 1.00784

    names = list(partitioned_names)

    # 1. Smallest absolute deviation per row (NaN where every sum is NaN, or
    #    where there are no partitions at all).
    if names:
        deviations = df[names].sub(target_number).abs()
        min_deviations = deviations.min(axis=1).to_numpy(dtype=float)
    else:
        deviations = None
        min_deviations = np.full(len(df), np.nan)

    # 2. Rows within the threshold; NaN compares False, so such rows are dropped.
    with np.errstate(invalid='ignore'):
        keep = min_deviations <= threshold
    n_kept = int(keep.sum())

    result = df.loc[keep, original_cols].copy()
    kept_rows = df.loc[keep]

    selected_total = np.empty(n_kept, dtype=float)
    component_a = np.empty(n_kept, dtype=float)
    component_b = np.empty(n_kept, dtype=float)
    charge_a = np.empty(n_kept, dtype=np.int64)
    charge_b = np.empty(n_kept, dtype=np.int64)

    if n_kept:
        # 3. Position of the winning partition for each kept row. Every kept row
        #    has at least one finite deviation, so nanargmin cannot hit an
        #    all-NaN row; ties resolve to the first column, like idxmin.
        best_pos = np.nanargmin(deviations.to_numpy(dtype=float)[keep], axis=1)

        # 4. Gather values one winning column at a time (at most charge-1
        #    iterations) instead of materialising a row Series per row.
        for pos in np.unique(best_pos):
            col_name = names[pos]
            # Finds '1' and '3' in strings like "1*m/z A + 3*m/z B"
            weights = re.findall(r'(\d+)\*', col_name)
            w1, w2 = int(weights[0]), int(weights[1])

            rows = best_pos == pos
            selected_total[rows] = kept_rows[col_name].to_numpy(dtype=float)[rows]
            component_a[rows] = kept_rows[f"comp_{w1}_m/z A"].to_numpy(dtype=float)[rows]
            component_b[rows] = kept_rows[f"comp_{w2}_m/z B"].to_numpy(dtype=float)[rows]
            charge_a[rows] = w1
            charge_b[rows] = w2

    # Standard columns
    result['selected_total'] = selected_total
    result['component_x1'] = component_a
    result['component_x2'] = component_b
    result['deviation'] = min_deviations[keep]

    # Charges and charge-adjusted masses:
    #   charge 1 -> component unchanged, charge 2 -> minus 1 * MASS_H, ...
    result['charge_A'] = charge_a
    result['charge_B'] = charge_b
    result['adj_mass_A'] = component_a - (charge_a - 1) * MASS_H
    result['adj_mass_B'] = component_b - (charge_b - 1) * MASS_H

    return result.reset_index(drop=True)

In [137]:
# Build the precursor model, expand every pair into all charge splits of a total
# charge of 6, then keep the pairs whose best split sums to pep.pep_mass within 0.1.
# NOTE(review): the sequence and the charge 6 are hard-coded here instead of using
# `pep_seq` / `charge` from the earlier cell (charge = 4 there) — confirm.
# NOTE(review): `data` is rebound twice, so re-running this cell alone operates on
# the already-filtered frame rather than on the de-duplicated pair list.
pep = peptide.Pep('[KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK+6H]6+', end_h20='NH3')
data, partitioned_names = partition_dataframe_by_charge(data, 6)
data = select_best_partition(data, ['m/z A', 'm/z B', 'Ranking'], pep.pep_mass, 0.1,partitioned_names)


In [138]:
# Length reported by the peptide model; annotate_dataframe uses it as the
# exclusive upper bound for b/y ion indices.
pep.pep_len

37

In [139]:
def annotate_dataframe(df, pep, threshold):
    """
    Label each adjusted component mass with the closest theoretical b/y ion.

    For both components (A and B) three columns are added:
      - explanation_<X>: name of the closest ion ("b3", "y12", ...), None if
        nothing lies within ``threshold``
      - deviation_<X>: absolute difference to that ion's theoretical mass
      - theoretical_mass_<X>: the matched theoretical mass

    Args:
        df (pd.DataFrame): Frame with 'adj_mass_A' and 'adj_mass_B' columns.
            It is NOT modified; the annotations are written to a copy, in line
            with partition_dataframe_by_charge.
        pep: Object exposing ``pep_len`` (int) and ``ion_mass(name)``; ions
            b1..b(pep_len-1) and y1..y(pep_len-1) are considered.
        threshold (float): Largest accepted absolute deviation (inclusive).

    Returns:
        pd.DataFrame: Copy of ``df`` with the six annotation columns added.
    """
    # --- 1. Pre-calculate all theoretical ions (b1, y1, b2, y2, ...) ---
    # Insertion order matters: on an exact tie the ion inserted first wins.
    theoretical_ions = {}
    for i in range(1, pep.pep_len):
        for series in ('b', 'y'):
            ion_name = f"{series}{i}"
            theoretical_ions[ion_name] = pep.ion_mass(ion_name)

    # --- 2. Closest ion within the threshold, or (None, None, None) ---
    def find_best_match(observed_mass):
        best = (None, None, None)
        min_diff = float('inf')
        for name, theoretical_mass in theoretical_ions.items():
            diff = abs(observed_mass - theoretical_mass)
            # Within threshold AND strictly closer than any previous match
            if diff <= threshold and diff < min_diff:
                min_diff = diff
                best = (name, diff, theoretical_mass)
        return best

    # --- 3. Annotate a copy so the caller's frame is left untouched ---
    annotated = df.copy()
    for component in ('A', 'B'):
        matches = [find_best_match(mass) for mass in annotated[f'adj_mass_{component}']]
        annotated[f'explanation_{component}'] = [m[0] for m in matches]
        annotated[f'deviation_{component}'] = [m[1] for m in matches]
        annotated[f'theoretical_mass_{component}'] = [m[2] for m in matches]

    return annotated

In [140]:
# Label both adjusted component masses with their closest b/y ion, accepting
# matches whose absolute deviation is at most 0.1.
data = annotate_dataframe(data, pep, 0.1)

In [141]:
# Scratch check: pep_mass of a second sequence at 6+.
# NOTE(review): presumably compared by eye with the m/z * 6 product in the next cell — confirm.
peptide.Pep('[YPSKPDNPGEDAPAEDMARYYSALRHYINLITRQRY+6H]6+', end_h20='NH3').pep_mass

4275.124860000001

In [142]:
# NOTE(review): same scratch product as an earlier cell (duplicate) — presumably
# an observed m/z times charge 6, compared with the pep_mass printed just above; confirm.
712.52139 * 6

4275.12834

In [143]:
import re
import numpy as np

def get_binary_coverage(pep_len, detected_ions):
    """
    Build a 0/1 coverage string for a peptide from detected b and y ion names.

    Position mapping (0-based index into the string):
      - "b<n>" marks index n - 1
      - "y<n>" marks index pep_len - n
    so a complementary pair b<n> / y<pep_len - n> marks two adjacent positions.

    Args:
        pep_len (int): The total length of the peptide.
        detected_ions (iterable): Ion names (e.g., ['b1', 'y1', 'b2']). Falsy
            entries (None, '') and names that do not start with 'b' or 'y'
            followed by digits are ignored, as are ions whose index falls
            outside the peptide.

    Returns:
        str: A string of '0' and '1' characters of length pep_len.
    """
    # Matches the leading "b1", "y12", etc.; anything after the number is ignored.
    pattern = re.compile(r'([by])(\d+)')

    coverage = ['0'] * pep_len

    for ion in detected_ions:
        if not ion:
            continue  # Skip None or empty strings

        match = pattern.match(str(ion))
        if match is None:
            continue

        ion_type, number = match.group(1), int(match.group(2))

        # b1 -> index 0 ; y1 -> index (len - 1), y2 -> index (len - 2)
        index = number - 1 if ion_type == 'b' else pep_len - number

        # A single range check covers both number == 0 and number > pep_len.
        if 0 <= index < pep_len:
            coverage[index] = '1'

    return ''.join(coverage)

In [209]:
# Residue mass for each of the 20 standard amino acids, keyed by one-letter code.
# The values agree with the standard monoisotopic residue masses, in Da.
amino_acid_masses = dict(
    A=71.03711,   # Alanine
    R=156.10111,  # Arginine
    N=114.04293,  # Asparagine
    D=115.02694,  # Aspartic acid
    C=103.00919,  # Cysteine
    E=129.04259,  # Glutamic acid
    Q=128.05858,  # Glutamine
    G=57.02146,   # Glycine
    H=137.05891,  # Histidine
    I=113.08406,  # Isoleucine
    L=113.08406,  # Leucine
    K=128.09496,  # Lysine
    M=131.04049,  # Methionine
    F=147.06841,  # Phenylalanine
    P=97.05276,   # Proline
    S=87.03203,   # Serine
    T=101.04768,  # Threonine
    W=186.07931,  # Tryptophan
    Y=163.06333,  # Tyrosine
    V=99.06841,   # Valine
)

In [144]:
# Pool the ion labels from both components and turn them into a coverage string.
# NOTE(review): `df` is rebound to the annotated frame here, shadowing the raw table.
# NOTE(review): the length 34 is hard-coded, while the saved output of
# `pep.pep_len` elsewhere in this notebook is 37 — confirm which peptide this
# call is meant for.
df = data
the_list = list(df['explanation_A']) + list(df['explanation_B'])
get_binary_coverage(34, the_list)

'0001010010100000001010111111111111'

In [202]:
# NOTE(review): scratch arithmetic — presumably an observed m/z times a charge of 4; confirm.
941.96162 * 4

3767.84648

In [203]:
# Scratch check (repeat of an earlier cell): pep_mass for this sequence at 6+.
peptide.Pep('[YPSKPDNPGEDAPAEDMARYYSALRHYINLITRQRY+6H]6+', end_h20='NH3').pep_mass

4275.124860000001

In [205]:
# NOTE(review): duplicate scratch product — presumably an observed m/z times
# charge 6, compared with the pep_mass printed just above; confirm.
712.52139 * 6

4275.12834

In [206]:
# Scratch check: pep_mass for the 37-residue sequence at 6+.
# NOTE(review): presumably compared by eye with the m/z * 6 product in the next cell — confirm.
peptide.Pep('[KWKLFKKIEKVGQNIRDGIIKAGPAVAVVGQATQIAK+6H]6+', end_h20='NH3').pep_mass

4007.4216300000007

In [207]:
# NOTE(review): scratch arithmetic — presumably an observed m/z times charge 6,
# compared with the pep_mass printed just above; confirm.
667.90419 * 6

4007.42514

In [220]:
# Scratch check: same sequence with a "-NH2" suffix inside the brackets.
# NOTE(review): how peptide.Pep interprets "-NH2" together with end_h20='NH3'
# is not visible here — the result is ~251 higher than without the suffix; confirm
# the string is parsed as intended.
peptide.Pep('[YPSKPDNPGEDAPAEDMARYYSALRHYINLITRQRY-NH2+6H]6+', end_h20='NH3').pep_mass

4526.2267

In [217]:
# 4526.2267 is the pep_mass value pasted by hand from the "-NH2" cell's output.
# Subtracts the R and Y residue masses and 17.02655.
# NOTE(review): 17.02655 is presumably the mass of NH3 — confirm.
4526.2267 - amino_acid_masses['R'] - 17.02655 - amino_acid_masses['Y']

4190.035710000001

In [219]:
# Same hand-pasted pep_mass, minus the Y and S residue masses.
# NOTE(review): presumably compared by eye with the m/z * 2 product in the next cell — confirm.
4526.2267 - amino_acid_masses['Y'] - amino_acid_masses['S']

4276.13134

In [212]:
# NOTE(review): scratch arithmetic — presumably an observed m/z times a charge of 2; confirm.
2138.074605 * 2

4276.14921

In [221]:
# Switch to a second, 33-residue sequence and rebuild the peptide model.
# NOTE(review): `charge` is not set in this cell; it takes whatever value the
# kernel currently holds (4 from the earlier config cell, unless changed) — confirm.
# NOTE(review): end_h20=True here, whereas earlier cells pass end_h20='NH3' — confirm intended.
pep_seq = 'HADGSFSDEMNTILDNLAARDFINWLIQTKITD'
pep = peptide.Pep(f'[{pep_seq}+{charge}H]{charge}+', end_h20=True)

In [226]:
# Break positions used to build a list of theoretical ion masses.
# NOTE(review): 12 appears twice and 13 is missing from an otherwise consecutive
# run 9..31 — possibly a typo for 13; confirm.
breaks = [2, 7, 9, 10, 11, 12, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


# For each break i: the b_i mass, followed by the y_(33 - i) masses.
# NOTE(review): 33 is hard-coded; it equals len(pep_seq) for the sequence set in
# the preceding cell but will silently go stale if the sequence changes.
spec = [pep.ion_mass(f'b{i}') for i in breaks] + [pep.ion_mass(f'y{33 - i}') for i in breaks]

In [234]:
# NOTE(review): 18.01 is presumably an approximate water mass being subtracted
# from the y1 ion mass — confirm.
pep.ion_mass('y1') - 18.01

116.03474999999999

In [239]:
# Rebuild the model for the 33-residue sequence, this time with an explicit charge of 6.
# NOTE(review): `iso` is assigned but not used anywhere in the visible cells — confirm it is needed.
pep_seq = 'HADGSFSDEMNTILDNLAARDFINWLIQTKITD'
charge = 6
iso = 4
pep = peptide.Pep(f'[{pep_seq}+{charge}H]{charge}+', end_h20=True)

In [240]:
# Print the precursor mass and a few ion masses for a manual sanity check.
print(pep.pep_mass)
print(pep.ion_mass('y16'))
print(pep.ion_mass('b16'))
# b15 and y18 indices add up to 33, the sequence length.
# NOTE(review): presumably their summed mass is being compared with pep_mass — confirm.
print(pep.ion_mass('y18') + pep.ion_mass('b15'))

3769.858630000001
1905.0228100000002
1747.7227599999999
3765.82963
