In [2]:
#import packages internal and external
import tests
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from importlib import reload
import pandas as pd
import datasetBuilder
import tools
import scipy
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.ensemble import RandomForestClassifier as rfc
import sklearn.base
import pickle
import copy
import tests
from sklearn.metrics import roc_auc_score as auc
import os

In [None]:
#Set path variables where we want to save created datasets, intermediate outputs, etc
#If you want to run this on your own computer, update the path
outputs_path='/Users/jonahpoczobutt/projects/specsim_res/figOutputs'

#inputs path is where the raw msp files that we want to read in reside
#If you want to run this on your own computer, update the path
#this must contain MSPS x,y and z
#NOTE(review): this is currently the same folder as outputs_path - confirm that is intended
inputs_path = '/Users/jonahpoczobutt/projects/specsim_res/figOutputs'

#This variable toggles whether we do a full run of the notebook, or if we read in variables created in a previous run
fullRun=True

#create directories for results
if fullRun:

    #The figure 1 cells write to {outputs_path}/fig1/fig1a and {outputs_path}/fig1/fig1b,
    #so the sub-figure folders are created inside fig1 rather than next to it.
    #makedirs(..., exist_ok=True) also creates missing parents and does not raise
    #when the notebook is re-run against folders that already exist.
    for result_dir in ('fig1/fig1a', 'fig1/fig1b', 'fig2'):
        os.makedirs(f'{outputs_path}/{result_dir}', exist_ok=True)


Preprocessing: Creating Target and Matches DFs


In [None]:
if fullRun:

    #This should be replaced with a function to read in all the databases
    target = datasetBuilder.get_target_df('/Users/jonahpoczobutt/projects/raw_data/nist_out.MSP').iloc[:,1:]

    #Collect the distinct inchi bases and shuffle them with a fixed seed.
    #list(set(...)) was used here before; for string values its ordering changes
    #from one kernel session to the next (hash randomisation), so the
    #first/second split could not be reproduced across runs.
    split_seed = 0
    unique_bases = np.asarray(pd.unique(target['inchi_base']), dtype=object)
    all_bases = np.random.default_rng(split_seed).permutation(unique_bases).tolist()

    #we will save the full target dataset as well as first and second halves
    target.to_csv(f'{outputs_path}/target_full.csv')

    #split the bases (not the rows) in half, so every row sharing an inchi base
    #lands in the same half
    half = len(all_bases) // 2
    first_bases = all_bases[:half]
    second_bases = all_bases[half:]

    #reset_index() keeps the old row labels as an 'index' column, as before
    first_target = target[np.isin(target['inchi_base'],first_bases)].reset_index()
    first_target.to_csv(f'{outputs_path}/first_target.csv')
    del(first_target)

    second_target = target[np.isin(target['inchi_base'],second_bases)].reset_index()
    second_target.to_csv(f'{outputs_path}/second_target.csv')
    del(second_target)
    del(target)

    #persist the split itself so later runs can reload exactly these halves
    np.save(f'{outputs_path}/first_bases.npy',first_bases)
    np.save(f'{outputs_path}/second_bases.npy',second_bases)
    del(first_bases)
    del(second_bases)


Figure 1a: Global Performance of Individual Metrics/Weighting Schemes

In [None]:
#these are the ppm windows that we want to test
#(previously an empty list, so no window was ever evaluated; the values below are
#the ones that used to sit unused in ppm_threshes)
ppm_windows = [3,5,10,15]

#this is the size of the sample we take from the full target
#kept as an int (was the float 1e6) since it is a count of rows
size=1_000_000

#this is the maximum number of matches we allow for each query, based on the precursor window
max_matches=100

#Similarity methods and transformation parameters below. Leave sim methods as None to run all
noise_threshes=[0.01,0.05,0.1]
centroid_tolerance_vals = [0.05,3]
centroid_tolerance_types=['da','ppm']
powers=[0.25,1,3,'ent',None]
#kept as an alias so anything still referring to ppm_threshes sees the same windows
ppm_threshes = ppm_windows
sim_methods=None

if fullRun:
    #we will evaluate the performance of the individual metrics on a large sample from the
    #full target dataset. You can set the size above

    #reload target. It was written with DataFrame.to_csv (index included), so it is
    #read back with pandas and the first column is restored as the index.
    #NOTE(review): any array-valued columns come back from csv as strings - confirm
    #that tests.create_variable_comparisons copes with that
    target=pd.read_csv(f'{outputs_path}/target_full.csv', index_col=0)

    #make sure the output folder exists before anything is written into it
    fig1a_folder = f'{outputs_path}/fig1/fig1a'
    os.makedirs(fig1a_folder, exist_ok=True)

    #comparison on large sample
    tests.create_variable_comparisons(target,
                            size=size,
                            ppm_threshes=ppm_windows,
                            noise_threshes=noise_threshes,
                            centroid_threshes=centroid_tolerance_vals,
                            centroid_types=centroid_tolerance_types,
                            powers=powers,
                            sim_methods=sim_methods,
                            max_matches=max_matches,
                            outfolder=fig1a_folder
                            )

Figure 1b: Assessing Metric Stability in Smaller Samples

In [None]:
if fullRun:
    #we will evaluate the stability of the individual metrics on a smaller sample from the
    #full target dataset.
    #NOTE(review): this still passes the same `size` as the figure 1a cell, so the
    #sample is not actually smaller - set a reduced size here once it is decided

    #reload target. It was written with DataFrame.to_csv (index included), so it is
    #read back with pandas and the first column is restored as the index.
    target=pd.read_csv(f'{outputs_path}/target_full.csv', index_col=0)

    #make sure the output folder exists before anything is written into it
    fig1b_folder = f'{outputs_path}/fig1/fig1b'
    os.makedirs(fig1b_folder, exist_ok=True)

    #comparison on small sample
    tests.create_variable_comparisons(target,
                            size=size,
                            ppm_threshes=ppm_windows,
                            noise_threshes=noise_threshes,
                            centroid_threshes=centroid_tolerance_vals,
                            centroid_types=centroid_tolerance_types,
                            powers=powers,
                            sim_methods=sim_methods,
                            max_matches=max_matches,
                            outfolder=fig1b_folder
                            )

In [10]:
#figure 1
#column 3 encodes: noise clip, peak consolidation value and type, weighting
#a weighting of None is the original weighted entropy scheme from the paper
def _read_ranked_matches(csv_path):
    """Read a headerless results csv and order its rows by column 2, largest first."""
    return pd.read_csv(csv_path, header=None).sort_values(by=2, ascending=False)

ppm3 = _read_ranked_matches('/Users/jonahpoczobutt/projects/specsim_res/individual_sims/matches_3_ppm.csv')

ppm5 = _read_ranked_matches('/Users/jonahpoczobutt/projects/specsim_res/individual_sims/matches_5_ppm.csv')

ppm10 = _read_ranked_matches('/Users/jonahpoczobutt/projects/specsim_res/individual_sims/matches_10_ppm.csv')

ppm15 = _read_ranked_matches('/Users/jonahpoczobutt/projects/specsim_res/individual_sims/matches_15_ppm.csv')

In [9]:
#first 20 rows of the 3 ppm results, in their sorted order
ppm3.head(20)

Unnamed: 0,0,1,2,3
881,17,lorentzian,0.725874,0.01_3_ppm_None
497,17,lorentzian,0.721003,0.01_3_ppm_0.25
401,17,lorentzian,0.716741,0.01_0.05_da_None
593,17,lorentzian,0.716667,0.01_3_ppm_1
1831,16,lorentzian,0.715604,0.05_3_ppm_None
1451,16,lorentzian,0.713436,0.05_3_ppm_0.25
457,73,max_entropy,0.710486,0.01_0.05_da_None
1546,16,lorentzian,0.71027,0.05_3_ppm_1
388,4,max_entropy_jonah,0.709743,0.01_0.05_da_None
387,3,max_bhattacharya_2,0.70965,0.01_0.05_da_None


In [11]:
#first 20 rows of the 5 ppm results, in their sorted order
ppm5.head(20)

Unnamed: 0,0,1,2,3
880,17,lorentzian,0.741499,0.01_3_ppm_None
492,17,lorentzian,0.736347,0.01_3_ppm_0.25
589,17,lorentzian,0.73359,0.01_3_ppm_1
1836,17,lorentzian,0.732658,0.05_3_ppm_None
1452,17,lorentzian,0.729282,0.05_3_ppm_0.25
1548,17,lorentzian,0.726975,0.05_3_ppm_1
1740,17,lorentzian,0.723948,0.05_3_ppm_ent
2791,17,lorentzian,0.722759,0.1_3_ppm_None
771,5,chi2,0.721344,0.01_3_ppm_ent
396,16,lorentzian,0.721095,0.01_0.05_da_None


In [12]:
#first 20 rows of the 10 ppm results, in their sorted order
ppm10.head(20)

Unnamed: 0,0,1,2,3
886,17,lorentzian,0.738087,0.01_3_ppm_None
502,17,lorentzian,0.734555,0.01_3_ppm_0.25
598,17,lorentzian,0.729483,0.01_3_ppm_1
405,17,lorentzian,0.72665,0.01_0.05_da_None
1846,17,lorentzian,0.726006,0.05_3_ppm_None
1462,17,lorentzian,0.723874,0.05_3_ppm_0.25
916,47,motyka,0.721896,0.01_3_ppm_None
902,33,ruzicka,0.721896,0.01_3_ppm_None
947,78,intersection,0.721896,0.01_3_ppm_None
937,68,braycurtis,0.721896,0.01_3_ppm_None


In [13]:
#first 20 rows of the 15 ppm results, in their sorted order
ppm15.head(20)

Unnamed: 0,0,1,2,3
885,17,lorentzian,0.754865,0.01_3_ppm_None
497,17,lorentzian,0.751154,0.01_3_ppm_0.25
401,17,lorentzian,0.745204,0.01_0.05_da_None
594,17,lorentzian,0.743385,0.01_3_ppm_1
1845,17,lorentzian,0.740044,0.05_3_ppm_None
1457,17,lorentzian,0.737968,0.05_3_ppm_0.25
964,96,entropy,0.736466,0.01_3_ppm_None
1361,16,lorentzian,0.73624,0.05_0.05_da_None
776,5,chi2,0.736063,0.01_3_ppm_ent
921,53,bhattacharya_2,0.736061,0.01_3_ppm_None
