In [8]:
import pandas as pd
import numpy as np
import time
import sys

# Path to the input CSV, relative to the notebook's working directory.
data = '../data/test_data/2022_04_22_NEG_RSL3DAAvsCtrl_test_input.csv'
# NOTE(review): presumably the m/z shift contributed by one label unit; pick_pairs
# uses its own copy of this value, and no code visible in this notebook reads this
# module-level D - confirm whether it is still needed.
D = 1.0063

def upload(file, limit=None):
    """Load the raw export into a DataFrame.

    The third line of the file (``header=2``) supplies the column names; the
    lines above it are skipped by the parser.

    Parameters
    ----------
    file : str, path or file-like
        Anything ``pd.read_csv`` accepts as its first argument.
    limit : int, optional
        Maximum number of data rows to read. ``None`` reads every row.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(file, nrows=limit, header=2)

# Read the full input file (no row limit) into the raw, unmodified frame.
df_raw = upload(data)
# df_raw

In [9]:
def format(df, samples=None, first_col=16):
    """Keep the identifying columns plus the first block of intensity columns.

    The intensity block starts at column position ``first_col``. It ends either
    after ``samples`` columns or, when ``samples`` is None, just before the first
    column whose name ends in ``.1`` (the suffix pandas appends to a repeated
    column name, i.e. the start of the duplicated intensity block).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain 'Compound', 'm/z', 'Retention time (min)' and
        'CCS (angstrom^2)'.
    samples : int, optional
        Number of intensity columns to keep. When omitted the end of the block
        is detected from the duplicated column names.
    first_col : int, default 16
        Position of the first intensity column.

    Returns
    -------
    pandas.DataFrame
        Selected columns, sorted by 'Compound' in descending order with a fresh
        0..n-1 index.

    Raises
    ------
    IndexError
        If ``samples`` is None and no duplicated ('.1') column exists at or after
        ``first_col``. IndexError is kept because that is what the original
        lookup raised in this situation.
    """
    col_main = ['Compound',
                'm/z',
                'Retention time (min)',
                'CCS (angstrom^2)']
    col = df.columns.tolist()  # all column names, in file order

    if samples is not None:
        stop = first_col + samples
    else:
        # Match on the suffix, and only from the intensity block onwards: a plain
        # substring test would also stop at sample names that merely contain
        # ".1" (e.g. "S0.15_a") or at an unrelated metadata column.
        stop = next(
            (pos for pos in range(first_col, len(col)) if str(col[pos]).endswith('.1')),
            None,
        )
        if stop is None:
            raise IndexError(
                "no duplicated ('.1') intensity column found; "
                "pass samples=<number of intensity columns> explicitly"
            )

    intensities = col[first_col:stop]  # intensity columns we wish to keep

    df_keep = df[col_main + intensities].sort_values(by=["Compound"], ascending=False).reset_index(drop=True)
    return df_keep

# Reduce the raw table to the identifying columns plus the intensity block;
# with no `samples` argument the end of the block is detected automatically.
df_keep = format(df_raw)
df_keep

Unnamed: 0,Compound,m/z,Retention time (min),CCS (angstrom^2),NEG_2022_04_22_NR_LIPIDS_HILIC_C1,NEG_2022_04_22_NR_LIPIDS_HILIC_C2,NEG_2022_04_22_NR_LIPIDS_HILIC_C3,NEG_2022_04_22_NR_LIPIDS_HILIC_C4,NEG_2022_04_22_NR_LIPIDS_HILIC_RD1,NEG_2022_04_22_NR_LIPIDS_HILIC_RD2,NEG_2022_04_22_NR_LIPIDS_HILIC_RD3,NEG_2022_04_22_NR_LIPIDS_HILIC_RD4
0,7.19_761.5814m/z,761.581361,7.191133,283.714653,14424.250831,15546.810707,15752.178958,15158.249076,12851.160954,12088.743299,12931.389489,12798.276172
1,6.52_818.5908m/z,818.590775,6.516917,292.077396,17063.204540,18668.456336,18669.307763,18162.706564,13342.795819,13256.121585,13612.094663,13407.464458
2,6.42_874.6423m/z,874.642277,6.415367,300.348589,442.313697,554.683796,537.265054,506.080994,1815.630136,1932.093286,2018.909511,2029.645906
3,6.42_868.6062m/z,868.606189,6.415367,297.533583,3317.177691,3397.710084,3773.062375,3592.995155,1770.177059,1737.749249,1679.404649,1711.646536
4,6.42_851.6450m/z,851.645026,6.415367,291.889806,843.105676,891.241749,993.072197,801.870628,2240.411263,2273.591316,2460.144990,2448.969369
...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.59_789.5290m/z,789.528999,0.587117,277.651732,2192.652227,3547.463325,3694.432424,4527.774968,1850.102865,1774.144118,1307.639135,1370.721035
260,0.36_325.1838m/z,325.183838,0.364283,190.014531,14791.626474,13729.065344,14335.865633,14655.894501,17418.861267,16938.366655,17238.919242,16646.434481
261,0.36_311.1685m/z,311.168457,0.364283,184.788887,12893.758098,11939.044145,12920.701816,12978.350834,15621.259825,15291.593195,15480.599859,15192.816339
262,0.36_297.1522m/z,297.152201,0.364283,179.490923,6140.965064,5796.626798,6345.268586,6328.472748,7759.185308,7681.380859,7703.513028,7763.436221


In [10]:
def pick_pairs(df, a, b, rt_tol=1e-3, mz_tol=1e-4, ccs_tol=1e-3):
    '''Find compound pairs that co-elute, share a CCS and differ in m/z by D*(b - a).

    Two rows form a pair when their retention times and CCS values agree and the
    m/z of one equals the m/z of the other plus ``1.0063 * (b - a)``. The pair is
    reported regardless of which of the two rows comes first in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Compound', 'm/z', 'Retention time (min)' and
        'CCS (angstrom^2)'.
    a, b : numbers
        The m/z offset searched for is ``1.0063 * (b - a)``.
    rt_tol, mz_tol, ccs_tol : float
        RELATIVE tolerances: each is passed to ``np.isclose`` as ``rtol`` (the
        absolute tolerance stays at numpy's default), so e.g. ``mz_tol=1e-4``
        allows roughly 0.08 at m/z 800.

    Returns
    -------
    numpy.ndarray, shape (n_pairs, 2), dtype object
        Compound names (not row indices). In every row the first name is the
        compound whose m/z is higher by the offset. Rows are ordered by the
        position of the earlier member of the pair in ``df``. Shape is (0, 2)
        when nothing matches.

    Side effects
    ------------
    Prints the number of pairs found and the elapsed wall-clock time.
    '''
    mz = np.asarray(df['m/z'], dtype=float)
    rt = np.asarray(df['Retention time (min)'], dtype=float)
    ccs = np.asarray(df['CCS (angstrom^2)'], dtype=float)
    names = np.asarray(df['Compound'], dtype=object)
    D = 1.0063
    mass_adjust = D*(b - a)

    start = time.time()

    idxs = []
    n_rows = len(df)
    for i in range(n_rows - 1):
        # Compare row i against every later row in one vectorised call instead of
        # one np.isclose call per (i, j); memory stays O(n).
        rest = slice(i + 1, n_rows)
        same_species = (np.isclose(rt[i], rt[rest], rt_tol)
                        & np.isclose(ccs[i], ccs[rest], ccs_tol))
        if not same_species.any():
            continue

        # Row i carries the offset relative to row j ...
        i_shifted = same_species & np.isclose(mz[i], mz[rest] + mass_adjust, mz_tol)
        # ... or row j carries it relative to row i. Without this second test a
        # pair is missed whenever the shifted compound happens to sort after its
        # partner (e.g. names sorted as text put 999.x before 1005.x).
        j_shifted = (same_species & ~i_shifted
                     & np.isclose(mz[rest], mz[i] + mass_adjust, mz_tol))

        for offset in np.flatnonzero(i_shifted | j_shifted):
            j = i + 1 + int(offset)
            idxs.append([i, j] if i_shifted[offset] else [j, i])

    # reshape(-1, 2) keeps a well-formed (0, 2) result when no pair was found.
    idx_pairs = np.array(idxs, dtype=int).reshape(-1, 2)
    compound_pairs = names[idx_pairs]

    end = time.time()
    print(compound_pairs.shape[0], "pairs found | Run time = ", end-start)
    return compound_pairs

# Search the formatted table for pairs, passing a=5 and b=11
# (the m/z offset searched for is therefore D * 6).
pairs = pick_pairs(df_keep, 5, 11)
pairs

[############]
11 pairs found | Run time =  2.2563538551330566


array([['6.36_877.6611m/z', '6.36_871.6240m/z'],
       ['5.17_803.6241m/z', '5.17_797.5869m/z'],
       ['5.17_777.6086m/z', '5.17_771.5711m/z'],
       ['5.05_761.6128m/z', '5.05_755.5739m/z'],
       ['3.41_924.6492m/z', '3.41_918.6112m/z'],
       ['3.41_896.7594m/z', '3.41_890.7210m/z'],
       ['3.38_978.6214m/z', '3.38_972.5838m/z'],
       ['1.71_808.5985m/z', '1.71_802.5766m/z'],
       ['1.02_976.6035m/z', '1.02_970.5634m/z'],
       ['0.90_861.6306m/z', '0.90_855.5932m/z'],
       ['0.90_839.6401m/z', '0.90_833.6022m/z']], dtype=object)

In [11]:
# """Finding compound name pairs from indexes"""
# # for i in pairs:
# #     print(df_keep['Compound'][i[0]], df_keep['Compound'][i[1]])

# list = pairs.flatten().tolist()
# comp = np.array(df_keep['Compound'].iloc[list]).reshape(len(pairs), 2)
# comp

In [50]:
"""Test for comparing pairs"""
from collections import Counter
import unittest

# Hand-curated reference pairs. Compared with the detector's output, one pair has
# its two names swapped and one pair sits at a different position, so the check
# below has to be insensitive to both kinds of reordering.
true_pairs = [['6.36_877.6611m/z', '6.36_871.6240m/z'],
              ['5.17_803.6241m/z', '5.17_797.5869m/z'],
              ['5.17_777.6086m/z', '5.17_771.5711m/z'],
              ['5.05_755.5739m/z', '5.05_761.6128m/z'],  # swapped order   OG:'5.05_761.6128m/z', '5.05_755.5739m/z'
              ['3.41_924.6492m/z', '3.41_918.6112m/z'],
              ['3.41_896.7594m/z', '3.41_890.7210m/z'],
              ['0.90_861.6306m/z', '0.90_855.5932m/z'],  # re-inserted here
              ['3.38_978.6214m/z', '3.38_972.5838m/z'],
              ['1.71_808.5985m/z', '1.71_802.5766m/z'],
              ['1.02_976.6035m/z', '1.02_970.5634m/z'],
              # NOTE(review): '0.90_833.6012m/z' differs from the detected name
              # '0.90_833.6022m/z', which is why the last result is False -
              # confirm whether this mismatch is intentional.
              ['0.90_839.6401m/z', '0.90_833.6012m/z']]

print(type(pairs), type(true_pairs))

# One plain bool per detected pair: True when some reference pair holds the same
# two names in either order. any() ends the scan at the first hit and yields
# False for "no hit", so the last reference entry needs no special-casing.
l1 = [
    any(Counter(pair) == Counter(tpair) for tpair in true_pairs)
    for pair in pairs
]

print(l1, len(l1))



<class 'list'> <class 'list'>
[True, True, True, True, True, True, True, True, True, True, False] 11


In [51]:
# Define a class in which the tests will run
class UnitTests(unittest.TestCase):
    def test_pairs(self):
        l1 = []
        for pair in pairs:
            for tpair in true_pairs:
                res = np.any(Counter(pair)==Counter(tpair))
                if res == True: 
                    l1.append(res)
                    break
                elif tpair == true_pairs[-1]:
                    l1.append(res)
        self.assertNotIn(False, l1)

# Build a suite from UnitTests and run it in-process with the text runner
# (unittest.main() is not used here). The runner's result object is bound to
# `_` so it is not echoed as the cell's output.
suite = unittest.TestLoader().loadTestsFromTestCase(UnitTests)
_ = unittest.TextTestRunner().run(suite)

F
FAIL: test_pairs (__main__.UnitTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_394/3697784467.py", line 13, in test_pairs
    self.assertNotIn(False, l1)
AssertionError: False unexpectedly found in [True, True, True, True, True, True, True, True, True, True, False]

----------------------------------------------------------------------
Ran 1 test in 0.009s

FAILED (failures=1)


In [31]:
# df_pairs = df_keep.iloc[pairs.flatten()]
# df_pairs.head()

### Function to adjust the masses of each pair

In [None]:
def mass_adj(pairs, df, a, b, D=1.0063):
    '''Return the paired rows of ``df`` with an added, offset-corrected 'm/z_adj' column.

    For every pair the first member has ``D * b`` subtracted from its m/z and the
    second member has ``D * a`` subtracted, so that the two adjusted values of a
    genuine pair coincide. (Subtracting the bare counts ``a`` and ``b`` leaves
    the two members 0.0063 * (b - a) apart, which disagrees with the
    ``D * (b - a)`` offset used to detect the pairs.)

    Parameters
    ----------
    pairs : array-like, shape (n_pairs, 2)
        Either integer row positions into ``df`` or compound names as found in
        ``df['Compound']``. In each pair the higher-mass member comes first.
    df : pandas.DataFrame
        Must contain 'm/z' (and 'Compound' when ``pairs`` holds names).
    a, b : numbers
        Multipliers of ``D`` removed from the second / first member of each pair.
    D : float, default 1.0063
        m/z offset per unit of ``a`` / ``b``.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows, pair members adjacent, with 'm/z_adj'
        inserted as the third column. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``pairs`` holds names that do not occur in ``df['Compound']``.
    '''
    pair_arr = np.asarray(pairs)

    if pair_arr.size == 0:
        positions = np.empty(0, dtype=int)
    elif pair_arr.dtype.kind in 'iu':
        # Already row positions.
        positions = pair_arr.reshape(-1)
    else:
        # Compound names: translate to row positions (first occurrence wins).
        lookup = {}
        for pos, name in enumerate(df['Compound']):
            lookup.setdefault(name, pos)
        flat_names = pair_arr.reshape(-1)
        missing = [name for name in flat_names if name not in lookup]
        if missing:
            raise KeyError("compounds not found in df['Compound']: %s" % missing)
        positions = np.array([lookup[name] for name in flat_names], dtype=int)

    # Work on an explicit copy so inserting a column never touches the caller's frame.
    df_pairs = df.iloc[positions].copy()

    # Float copy of the masses, one row per pair: column 0 = first member.
    masses = np.array(df_pairs["m/z"], dtype=float).reshape(-1, 2)
    masses[:, 0] -= D * b
    masses[:, 1] -= D * a

    df_pairs.insert(2, "m/z_adj", masses.reshape(-1))

    return df_pairs

# Compute the adjusted masses for the detected pairs, using the same a=5, b=11
# that were passed to pick_pairs.
df_adj = mass_adj(pairs, df_keep, 5, 11)
df_adj