In [1]:
data = '../data/2022_07_13_WTvsKO_allcompounds_neg.csv'

a = 5
b = 11

D = 1.0063

m1 = a*D
m2 = b*D

In [2]:
import pandas as pd
import numpy as np
import time

### fcn to upload data

In [3]:
def upload(file):
    df = pd.read_csv(file, header=2) #header=2 cuts off first two rows
    return df

df_raw = upload(data)

- sort df at beginning by m/z
- define m1 and m2 at beginning; applicable to pair picking as well as mass adjustment
- m1 = a*1.0063, m2 = b*1.0063
- input for number of compounds being analyzed, rather than finding the '.1' values

### fcn to format data

In [4]:
def format(df):
    col = df.columns.tolist() #create a list of all column names
    main = [0,2,4,5] #Index of main columns we wish to keep and compare
    stop = col.index([col for col in df.columns if '.1' in col][0]) #index of duplicate columns we don't need
    intensities = col[16:stop] #intensity columns we wish to keep
    col_main = [] #Column names of kept columns
    for i in main:
        col_main.append(col[i])
    '''Create new filtered dataframe of important columns'''
    df_keep = df[col_main]# + intensities] #.sort_values(["m/z"], ascending=False)
    return df_keep

df_keep = format(df_raw)[:200]

df_keep.to_csv('formatted_data.csv',index=False)

In [None]:
pd.read_csv('formatted_data.csv')

### fcn to pick out pairs

In [5]:
def pick_pairs(df):
    '''Returns index pairs of compounds with same RT and CCS'''

    '''Define lists and tolerances of each column to compare with itself'''
    mz = df['m/z'].tolist()
    mz_tol = 1e-4
    rt = df['Retention time (min)'].tolist()
    rt_tol = 1e-3
    ccs = df['CCS (angstrom^2)'].tolist()
    ccs_tol = 1e-3

    # '''Define lists and tolerances of each column to compare with itself'''
    # mz = np.array(df['m/z'])
    # mz_tol = 1e-4
    # rt = np.array(df['Retention time (min)'])
    # rt_tol = 1e-3
    # ccs = np.array(df['CCS (angstrom^2)'])
    # ccs_tol = 1e-3

    # '''Define lists and tolerances of each column to compare with itself'''
    # mz = df['m/z']
    # mz_tol = 1e-4
    # rt = df['Retention time (min)']
    # rt_tol = 1e-3
    # ccs = df['CCS (angstrom^2)']
    # ccs_tol = 1e-3

    '''Initialize a list for indexes to be held for each pair of matched values'''
    idxs = []
    '''Create nested for loop to compare i with j in each column'''
    for i in range(len(df)):
        for j in range(len(df)):
            '''Define checks for each column'''
            check_mz = np.isclose(mz[i], mz[j] + (m2 - m1), mz_tol)        
            check_rt = np.isclose(rt[i], rt[j], rt_tol)
            check_ccs = np.isclose(ccs[i], ccs[j], ccs_tol)
            
            '''Record results of each check (True/False)'''
            checks = [check_mz, check_rt, check_ccs]
            
            '''If all checks are true, append to list of pairs'''
            if all(checks) and i!=j:
                idxs.append([i,j])
                pairs = np.array(idxs)
            else:
                pass


    return pairs

In [6]:
start = time.time()
pairs = pick_pairs(df_keep)
end = time.time()
run_time = end-start
print("Run time = ", run_time)
pairs

Run time =  5.791602611541748


array([[  0, 188],
       [ 14,  45],
       [ 16,  15],
       [ 20,  18],
       [ 25, 150],
       [ 28,  24],
       [ 35, 104],
       [ 36,  24],
       [ 38,  26],
       [ 41, 135],
       [ 43, 149],
       [ 44,  10],
       [ 54,  75],
       [ 55, 101],
       [ 56,  30],
       [ 57, 172],
       [ 70,   4],
       [ 88, 157],
       [ 92,  90],
       [105,  64],
       [121,  48],
       [138,  67],
       [164, 129],
       [167, 180],
       [191, 198],
       [199, 136]])

### Show dataframe of pairs

In [11]:
test_pairs = np.array([
            [  0, 188],
            [ 14,  45],
            [ 16,  15],
            [ 20,  18],
            [ 25, 150],
            [ 28,  24],
            [ 35, 104],
            [ 36,  24],
            [ 38,  26],
            [ 41, 135],
            [ 43, 149],
            [ 44,  10],
            [ 54,  75],
            [ 55, 101],
            [ 56,  30],
            [ 57, 172],
            [ 70,   4],
            [ 88, 157],
            [ 92,  90],
            [105,  64],
            [121,  48],
            [138,  67],
            [164, 129],
            [167, 180],
            # [191, 198],
            [199, 136]])

# np.testing.assert_array_equal([[5,2],[2,3]], [[5,2],[2,2]])
np.testing.assert_array_equal(pairs, test_pairs)

AssertionError: 
Arrays are not equal

(shapes (26, 2), (25, 2) mismatch)
 x: array([[  0, 188],
       [ 14,  45],
       [ 16,  15],...
 y: array([[  0, 188],
       [ 14,  45],
       [ 16,  15],...

In [None]:
df_pairs = df_keep.iloc[pairs.flatten()]
df_pairs.head()


### fcn to adjust masses of each pair

In [None]:
def mass_adj(pairs, df, x, y):
    '''Adjusts masses of given dataframe and list of pairs. Pairs must be together, with higher mass first. x is the lower value, y is the higher value.'''
    
    df_pairs = df.iloc[pairs.flatten()]
    masses = np.array(df_pairs["m/z"]).reshape((len(pairs), 2))
    masses[:, 0] -= y
    masses[:, 1] -= x

    df_pairs.insert(2, "m/z_adj", masses.flatten().tolist())

    return df_pairs

df_adj = mass_adj(pairs, df_keep, m1, m2)
df_adj