In [11]:
import pandas as pd
import numpy as np
import time
import sys

data = '../data/test_data/2022_04_22_NEG_RSL3DAAvsCtrl_test_input.csv'
D = 1.0063

def upload(file, limit=None):
    df = pd.read_csv(file, header=2, nrows=limit)
    return df

df_raw = upload(data)
# df_raw

In [12]:
def format(df, samples=None):
    col_main = ['Compound',
                'm/z',
                'Retention time (min)',
                'CCS (angstrom^2)']
    col = df.columns.tolist() #create a list of all column names
    if samples is not None:
        stop = 16 + samples
    else:
        stop = col.index([col for col in df.columns if '.1' in col][0]) #index of duplicate columns we don't need
    intensities = col[16:stop] #intensity columns we wish to keep
    
    df_keep = df[col_main + intensities].sort_values(by=["Compound"], ascending=False).reset_index(drop=True)
    return df_keep

df_keep = format(df_raw)
df_keep

Unnamed: 0,Compound,m/z,Retention time (min),CCS (angstrom^2),NEG_2022_04_22_NR_LIPIDS_HILIC_C1,NEG_2022_04_22_NR_LIPIDS_HILIC_C2,NEG_2022_04_22_NR_LIPIDS_HILIC_C3,NEG_2022_04_22_NR_LIPIDS_HILIC_C4,NEG_2022_04_22_NR_LIPIDS_HILIC_RD1,NEG_2022_04_22_NR_LIPIDS_HILIC_RD2,NEG_2022_04_22_NR_LIPIDS_HILIC_RD3,NEG_2022_04_22_NR_LIPIDS_HILIC_RD4
0,7.19_761.5814m/z,761.581361,7.191133,283.714653,14424.250831,15546.810707,15752.178958,15158.249076,12851.160954,12088.743299,12931.389489,12798.276172
1,6.52_818.5908m/z,818.590775,6.516917,292.077396,17063.204540,18668.456336,18669.307763,18162.706564,13342.795819,13256.121585,13612.094663,13407.464458
2,6.42_874.6423m/z,874.642277,6.415367,300.348589,442.313697,554.683796,537.265054,506.080994,1815.630136,1932.093286,2018.909511,2029.645906
3,6.42_868.6062m/z,868.606189,6.415367,297.533583,3317.177691,3397.710084,3773.062375,3592.995155,1770.177059,1737.749249,1679.404649,1711.646536
4,6.42_851.6450m/z,851.645026,6.415367,291.889806,843.105676,891.241749,993.072197,801.870628,2240.411263,2273.591316,2460.144990,2448.969369
...,...,...,...,...,...,...,...,...,...,...,...,...
259,0.59_789.5290m/z,789.528999,0.587117,277.651732,2192.652227,3547.463325,3694.432424,4527.774968,1850.102865,1774.144118,1307.639135,1370.721035
260,0.36_325.1838m/z,325.183838,0.364283,190.014531,14791.626474,13729.065344,14335.865633,14655.894501,17418.861267,16938.366655,17238.919242,16646.434481
261,0.36_311.1685m/z,311.168457,0.364283,184.788887,12893.758098,11939.044145,12920.701816,12978.350834,15621.259825,15291.593195,15480.599859,15192.816339
262,0.36_297.1522m/z,297.152201,0.364283,179.490923,6140.965064,5796.626798,6345.268586,6328.472748,7759.185308,7681.380859,7703.513028,7763.436221


In [55]:
def pick_pairs(df, a, b):
    '''Returns index pairs of compounds with same RT and CCS'''

    '''Define lists and tolerances of each column to compare with itself'''
    mz = np.array(df['m/z'])
    mz_tol = 1e-4
    rt = np.array(df['Retention time (min)'])
    rt_tol = 1e-3
    ccs = np.array(df['CCS (angstrom^2)'])
    ccs_tol = 1e-3
    D = 1.0063
    mass_adjust = D*(b - a)

    toolbar_width = 80

    # setup toolbar
    start = time.time()
    sys.stdout.write("[%s]" % ("#"))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['

    '''Initialize a list for indexes to be held for each pair of matched values'''
    idxs = []
    '''Create nested for loop to compare i with j in each column'''
    for i in range(len(df)):
        for j in range(i+1, len(df)):

            """Define checks for each column"""
            check_rt = np.isclose(rt[i], rt[j], rt_tol)
            if check_rt == True: pass
            else: continue
            
            check_mz = np.isclose(mz[i], mz[j] + mass_adjust, mz_tol)
            if check_mz == True: pass
            else: continue

            check_ccs = np.isclose(ccs[i], ccs[j], ccs_tol)
            if check_ccs == True: pass
            else: continue

            idxs.append([i,j])

            # update the bar
            sys.stdout.write("#")
            sys.stdout.flush()

    idx_pairs = np.array(idxs)
    flat_pairs = idx_pairs.flatten().tolist()
    mass_pairs = np.array(df['m/z'].iloc[flat_pairs]).reshape(len(idx_pairs), 2)

    sys.stdout.write("]\n") # this ends the progress bar
    end = time.time()
    print(mass_pairs.shape[0], "pairs found | Run time = ", end-start)
    return idx_pairs, mass_pairs

pairs, masses = pick_pairs(df_keep, 5, 11)

[############]
11 pairs found | Run time =  1.921722173690796


array([[877.66110815, 871.62397157],
       [803.62406841, 797.58686096],
       [777.60859289, 771.57112043],
       [761.61280606, 755.57392023],
       [924.64917386, 918.61120644],
       [896.75942805, 890.72097973],
       [978.6213752 , 972.58376553],
       [808.59848298, 802.57659759],
       [976.60351144, 970.5633508 ],
       [861.63055829, 855.59319303],
       [839.64012096, 833.60224066]])

In [53]:
data2 = '../data/test_data/2022_04_22_NEG_RSL3DAAvsCtrl_CSV_pairs_test_output.csv'
true_data = pd.read_csv(data2)
# true_data[true_data.columns.tolist()[:3]]
# df_keep.groupby(['m/z']).count().max()

true_mz = np.array(true_data['m/z']).reshape(int(len(true_data)/2), 2)
# true_mz

array([[978.6213752, 972.5837655],
       [976.6035114, 970.5633508],
       [949.6497534, 943.6137902],
       [924.6491739, 918.6112064],
       [922.6331353, 916.5968354],
       [905.6895514, 899.652542 ],
       [896.759428 , 890.7209797],
       [896.6183362, 890.5814109],
       [894.6029557, 888.5667271],
       [894.6027743, 888.5646863],
       [882.6029416, 876.5672706],
       [877.6611081, 871.6239716],
       [874.6422771, 868.6061886],
       [861.6305583, 855.593193 ],
       [839.640121 , 833.6022407],
       [808.598483 , 802.5765976],
       [805.639452 , 799.6012036],
       [803.6240684, 797.586861 ],
       [777.6085929, 771.5711204],
       [761.6128061, 755.5739202]])

In [58]:
"""Test for comparing pairs"""
from collections import Counter
import unittest

# true_pairs = []
# pairs = list(pairs)

print(type(pairs), type(true_mz))

l1 = []
for pair in masses:
       for tpair in true_mz:
              res = np.any(Counter(pair)==Counter(tpair))
              if res == True: 
                     l1.append(res)
                     break
              elif tpair == true_pairs[-1]:
                     l1.append(res)
              
print(l1, len(l1))



<class 'numpy.ndarray'> <class 'numpy.ndarray'>
[] 0


  elif tpair == true_pairs[-1]:


In [16]:
# Define a class in which the tests will run
class UnitTests(unittest.TestCase):
    def test_pairs(self):
        l1 = []
        for pair in compounds:
            for tpair in true_pairs:
                res = np.any(Counter(pair)==Counter(tpair))
                if res == True: 
                    l1.append(res)
                    break
                elif tpair == true_pairs[-1]:
                    l1.append(res)
        self.assertNotIn(False, l1)

suite = unittest.TestLoader().loadTestsFromTestCase(UnitTests)
_ = unittest.TextTestRunner().run(suite)

F
FAIL: test_pairs (__main__.UnitTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_1066/3586644625.py", line 13, in test_pairs
    self.assertNotIn(False, l1)
AssertionError: False unexpectedly found in [True, True, True, True, True, True, True, True, True, True, False]

----------------------------------------------------------------------
Ran 1 test in 0.006s

FAILED (failures=1)


In [17]:
# df_pairs = df_keep.iloc[pairs.flatten()]
# df_pairs.head()

### fcn to adjust masses of each pair

In [18]:
def mass_adj(pairs, df, a, b):
    '''Adjusts masses of given dataframe and list of pairs. Pairs must be together, with higher mass first. x is the lower value, y is the higher value.'''
    
    df_pairs = df.iloc[pairs.flatten()]
    masses = np.array(df_pairs["m/z"]).reshape((len(pairs), 2))
    masses[:, 0] -= b
    masses[:, 1] -= a

    df_pairs.insert(2, "m/z_adj", masses.flatten().tolist())

    return df_pairs

df_adj = mass_adj(pairs, df_keep, 5, 11)
# df_adj

Unnamed: 0,Compound,m/z,m/z_adj,Retention time (min),CCS (angstrom^2),NEG_2022_04_22_NR_LIPIDS_HILIC_C1,NEG_2022_04_22_NR_LIPIDS_HILIC_C2,NEG_2022_04_22_NR_LIPIDS_HILIC_C3,NEG_2022_04_22_NR_LIPIDS_HILIC_C4,NEG_2022_04_22_NR_LIPIDS_HILIC_RD1,NEG_2022_04_22_NR_LIPIDS_HILIC_RD2,NEG_2022_04_22_NR_LIPIDS_HILIC_RD3,NEG_2022_04_22_NR_LIPIDS_HILIC_RD4
8,6.36_877.6611m/z,877.661108,866.661108,6.361767,296.057245,66.967545,75.642003,57.802987,63.05648,3347.700848,3608.845722,3897.165149,3857.633179
9,6.36_871.6240m/z,871.623972,866.623972,6.361767,296.088953,1188.651581,1309.529967,1423.597303,1374.711892,4122.770095,4426.576576,4661.35487,4598.345368
31,5.17_803.6241m/z,803.624068,792.624068,5.174133,277.568296,196.284934,211.591912,258.284648,229.418663,1954.563385,2574.829285,3138.934488,3615.636884
33,5.17_797.5869m/z,797.586861,792.586861,5.174133,277.603675,1363.284068,1446.836335,1476.124984,1370.151429,2345.175272,2911.441065,3487.791256,4002.39985
36,5.17_777.6086m/z,777.608593,766.608593,5.174133,271.777713,368.586328,411.356598,423.448345,415.818216,4653.351953,5758.053828,6231.472059,7014.426233
37,5.17_771.5711m/z,771.57112,766.57112,5.174133,271.81468,4104.214386,5297.566865,4641.612403,4973.75238,5940.627557,7169.464484,8052.187963,8860.849448
55,5.05_761.6128m/z,761.612806,750.612806,5.052833,270.379867,212.854183,262.412367,264.369127,240.488769,7032.271559,1996.913082,9849.981797,10082.075129
58,5.05_755.5739m/z,755.57392,750.57392,5.052833,270.418193,1393.209103,1580.141196,1534.513307,1457.987416,7927.794916,2478.878079,10840.861653,11305.790982
110,3.41_924.6492m/z,924.649174,913.649174,3.413633,305.747336,94.429612,180.516214,110.586544,105.60408,5443.407612,5738.738399,6420.318064,7375.70317
111,3.41_918.6112m/z,918.611206,913.611206,3.413633,305.776879,460.345401,614.669488,524.546775,530.194242,7340.287958,7544.256545,8326.419331,9572.160468


In [19]:
from lipydomics.data import Dataset
from lipydomics.identification import add_feature_ids
# from tempfile import TemporaryFile


def lipid_id(input, output_name): 
    """identifies mass adjusted lipids and exports .xlsx to path specified"""
    # full = pd.read_csv(input) # this line isn't necessary if we feed in a dataframe
    trim = input.drop(columns=['Compound', 'm/z'])
    trim.to_csv('../data/trim.csv', index=False)
    data = open('../data/trim.csv')
    # need to change this to save to a temp directory
    dset = Dataset(data, esi_mode='neg')
    mz_tol = 0.03
    rt_tol = 0.3
    ccs_tol = 3.0
    tol = [mz_tol, rt_tol, ccs_tol]
    add_feature_ids(dset, tol, level='any')
    dset.export_xlsx(output_name)
    print('Identification Complete!')
    return output_name

# lipid_id(df_adj, "output")
input = df_adj
trim = input.drop(columns=['Compound', 'm/z'])
trim.to_csv('../data/trim.csv', index=False)
data = open('../data/trim.csv')
dset = Dataset(data, esi_mode='neg')
tol = [.03, .3, 3]
add_feature_ids(dset, tol, level='any')
print(dset)
dset.export_xlsx("ouptut.xlsx")

Dataset(
	csv="<_io.TextIOWrapper name='../data/trim.csv' mode='r' encoding='UTF-8'>",
	esi_mode="neg",
	samples=8,
	features=22,
	identified=22,
	normalized=False,
	rt_calibrated=False,
	ext_var=False,
	group_indices=None,
	stats={}
)


ModuleNotFoundError: No module named 'xlsxwriter'