In [35]:
data = '../data/2022_07_13_WTvsKO_allcompounds_neg.csv'

a = 5
b = 11

D = 1.0063

m1 = a*D
m2 = b*D

In [36]:
import pandas as pd
import numpy as np
import time

### fcn to upload data

In [37]:
def upload(file):
    df = pd.read_csv(file, header=2) #header=2 cuts off first two rows
    return df

df_raw = upload(data)

- sort df at beginning by m/z
- define m1 and m2 at beginning; applicable to pair picking as well as mass adjustment
- m1 = a*1.0063, m2 = b*1.0063
- input for number of compounds being analyzed, rather than finding the '.1' values

### fcn to format data

In [38]:
def format(df):
    col = df.columns.tolist() #create a list of all column names
    main = [0,2,4,5] #Index of main columns we wish to keep and compare
    stop = col.index([col for col in df.columns if '.1' in col][0]) #index of duplicate columns we don't need
    intensities = col[16:stop] #intensity columns we wish to keep
    col_main = [] #Column names of kept columns
    for i in main:
        col_main.append(col[i])
    '''Create new filtered dataframe of important columns'''
    df_keep = df[col_main]# + intensities] #.sort_values(["m/z"], ascending=False)
    return df_keep

df_keep = format(df_raw)[:200]

df_keep

Unnamed: 0,Compound,m/z,Retention time (min),CCS (angstrom^2)
0,5.09_803.7568m/z,803.756835,5.086917,284.123327
1,4.97_787.7598m/z,787.759812,4.965617,281.068778
2,5.05_757.2737m/z,757.273727,5.053083,278.108306
3,6.29_1020.8231m/z,1020.823084,6.294300,323.846316
4,5.09_776.6017n,797.588528,5.086917,284.160322
...,...,...,...,...
195,6.42_789.6441m/z,789.644147,6.415600,274.751741
196,5.02_715.5528n,750.543916,5.019233,278.152776
197,4.35_707.0502n,744.033132,4.345000,211.722671
198,1.14_782.5737m/z,782.573677,1.140067,271.639779


### fcn to pick out pairs

In [39]:
def pick_pairs(df):
    '''Returns index pairs of compounds with same RT and CCS'''

    '''Define lists and tolerances of each column to compare with itself'''
    mz = df['m/z'].tolist()
    mz_tol = 1e-4
    rt = df['Retention time (min)'].tolist()
    rt_tol = 1e-3
    ccs = df['CCS (angstrom^2)'].tolist()
    ccs_tol = 1e-3

    # '''Define lists and tolerances of each column to compare with itself'''
    # mz = np.array(df['m/z'])
    # mz_tol = 1e-4
    # rt = np.array(df['Retention time (min)'])
    # rt_tol = 1e-3
    # ccs = np.array(df['CCS (angstrom^2)'])
    # ccs_tol = 1e-3

    # '''Define lists and tolerances of each column to compare with itself'''
    # mz = df['m/z']
    # mz_tol = 1e-4
    # rt = df['Retention time (min)']
    # rt_tol = 1e-3
    # ccs = df['CCS (angstrom^2)']
    # ccs_tol = 1e-3

    '''Initialize a list for indexes to be held for each pair of matched values'''
    idxs = []
    '''Create nested for loop to compare i with j in each column'''
    for i in range(len(df)):
        for j in range(len(df)):
            '''Define checks for each column'''
            check_mz = np.isclose(mz[i], mz[j] + (m2 - m1), mz_tol)        
            check_rt = np.isclose(rt[i], rt[j], rt_tol)
            check_ccs = np.isclose(ccs[i], ccs[j], ccs_tol)
            
            '''Record results of each check (True/False)'''
            checks = [check_mz, check_rt, check_ccs]
            
            '''If all checks are true, append to list of pairs'''
            if all(checks) and i!=j:
                idxs.append([i,j])
                pairs = np.array(idxs)
            else:
                pass


    return pairs

In [40]:
start = time.time()
pairs = pick_pairs(df_keep)
end = time.time()
run_time = end-start
print("Run time = ", run_time)
pairs

Run time =  6.811499834060669


array([[  0, 188],
       [ 14,  45],
       [ 16,  15],
       [ 20,  18],
       [ 25, 150],
       [ 28,  24],
       [ 35, 104],
       [ 36,  24],
       [ 38,  26],
       [ 41, 135],
       [ 43, 149],
       [ 44,  10],
       [ 54,  75],
       [ 55, 101],
       [ 56,  30],
       [ 57, 172],
       [ 70,   4],
       [ 88, 157],
       [ 92,  90],
       [105,  64],
       [121,  48],
       [138,  67],
       [164, 129],
       [167, 180],
       [191, 198],
       [199, 136]])

### Show dataframe of pairs

In [41]:
df_pairs = df_keep.iloc[pairs.flatten()]
df_pairs.head()

Unnamed: 0,Compound,m/z,Retention time (min),CCS (angstrom^2)
0,5.09_803.7568m/z,803.756835,5.086917,284.123327
188,5.09_776.7309n,797.719358,5.086917,284.159531
14,5.05_761.6121m/z,761.61205,5.053083,278.080052
45,5.05_756.6069n,755.566165,5.053083,278.119515
16,3.16_899.8980m/z,899.89798,3.15735,302.465024


### fcn to adjust masses of each pair

In [42]:
def mass_adj(pairs, df, x, y):
    '''Adjusts masses of given dataframe and list of pairs. Pairs must be together, with higher mass first. x is the lower value, y is the higher value.'''
    
    df_pairs = df.iloc[pairs.flatten()]
    masses = np.array(df_pairs["m/z"]).reshape((len(pairs), 2))
    masses[:, 0] -= y
    masses[:, 1] -= x

    df_pairs.insert(2, "m/z_adj", masses.flatten().tolist())

    return df_pairs

df_adj = mass_adj(pairs, df_keep, m1, m2)
df_adj

Unnamed: 0,Compound,m/z,m/z_adj,Retention time (min),CCS (angstrom^2)
0,5.09_803.7568m/z,803.756835,792.687535,5.086917,284.123327
188,5.09_776.7309n,797.719358,792.687858,5.086917,284.159531
14,5.05_761.6121m/z,761.61205,750.54275,5.053083,278.080052
45,5.05_756.6069n,755.566165,750.534665,5.053083,278.119515
16,3.16_899.8980m/z,899.89798,888.82868,3.15735,302.465024
15,3.16_893.9342m/z,893.934249,888.902749,3.15735,302.495477
20,5.05_342.4169m/z,342.416944,331.347644,5.053083,190.202882
18,5.05_336.3790m/z,336.378983,331.347483,5.053083,190.331918
25,0.93_896.2917m/z,896.29166,885.22236,0.9313,293.059637
150,0.93_890.2471m/z,890.247072,885.215572,0.9313,293.089785
