In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from importlib import reload
import os
from collections import Counter
# import warnings
# warnings.filterwarnings("ignore")

import TunaSims
import func_ob
import tools
import datasetBuilder
import testUtils

Results for Different Ways of Distributing Interspectral Intensity Difference

In [None]:
# Score the same total intensity difference when it is split across i entries
# in two ways: a descending profile (proportional to 1/(x+1)) and a uniform one.
xs=list()
scores_1=list()
scores_2=list()

total_difference = 0.9
len_difference = 10  # not referenced anywhere in this cell
max_len = 25

# Both partials currently bind identical keyword values, so the two curves
# differ only because of the shape of the difference vector passed in.
func1 = partial(TunaSims.tuna_dif_distance,e=1,f=-1,h=500,i=-3,j=2,k=-800)
func2 = partial(TunaSims.tuna_dif_distance,e=1,f=-1,h=500,i=-3,j=2,k=-800)

normalize = False

for i in range(1,max_len):

    xs.append(i)

    # descending weights 1, 1/2, ..., 1/i rescaled so they sum to total_difference
    dif_1 = np.array([1/(x+1) for x in range(i)])
    dif_1 = dif_1/sum(dif_1)*total_difference

    # i equal entries that also sum to total_difference
    dif_2 = np.array([total_difference/i for x in range(i)])

    if normalize:
        # NOTE(review): this branch passes a single positional argument while
        # the branch below passes two (difference vector and zeros) -- confirm
        # that tuna_dif_distance accepts the one-argument form.
        scores_1.append(1- 1/func1(dif_1))
        scores_2.append(1 - 1/func2(dif_2))
    else:
        scores_1.append(func1(dif_1, np.zeros(len(dif_1))))
        scores_2.append(func2(dif_2, np.zeros(len(dif_2))))

plt.plot(xs, scores_1, label='descending')
plt.plot(xs, scores_2, label='uniform')  # label typo ('unfiorm') fixed

# axis labels so the figure can be read without the code next to it
plt.xlabel('length of difference vector')
plt.ylabel('score')

plt.legend()
plt.show()

In [None]:
#databases
# Absolute paths to the output directory and to the pickled spectral libraries.
outputs_path='/Users/jonahpoczobutt/projects/TunaRes/testy'
nist14='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist14_highres.pkl'
nist20_prot_deprot = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist20_prot_deprot.pkl'
nist23_hr_prot_deprot_only = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_prot_deprot_only.pkl'
nist23_hr_full ='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_full.pkl'
gnps='/Users/jonahpoczobutt/projects/raw_data/db_csvs/gnps_highres.pkl'
mona='/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_highres.pkl'
metlin='/Users/jonahpoczobutt/projects/raw_data/db_csvs/metlin_highres_inst.pkl'
mona_nist = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_nist_prot_only.pkl'

# query / target select which library is searched against which;
# with self_search the query library is also used as the target.
self_search=False
query = metlin
target = nist23_hr_full
if self_search:
    target=query

fullRun=True
if fullRun:
    # makedirs with exist_ok=True creates the whole nested tree in one call and
    # does not raise FileExistsError when the cell is re-run (os.mkdir did).
    # Existing files in these directories are left in place here; later cells
    # overwrite them by name.
    os.makedirs(f'{outputs_path}/intermediateOutputs/splitMatches', exist_ok=True)

In [None]:
fullRun=True
if fullRun:

    #This should be replaced with a function to read in all the databases
    query_ = pd.read_pickle(query)
    all_bases = list(set(query_['inchi_base']))

    # queryID is the row position for a self search, otherwise the placeholder "_"
    if self_search:
        _query_ids = list(range(len(query_)))
    else:
        _query_ids = ["_"] * len(query_)
    query_.insert(0,'queryID', _query_ids)

    # np.random.shuffle reorders the list in place (no return value)
    np.random.shuffle(all_bases)

    # split the shuffled unique inchi_base values 50% / 20% / 30%
    _n_bases = len(all_bases)
    _cut_a = int(_n_bases*0.5)
    _cut_b = int(_n_bases*0.7)

    # one entry per split: (base values, query pickle path, bases npy path)
    _split_plan = [
        (all_bases[:_cut_a],
         f'{outputs_path}/intermediateOutputs/splitMatches/first_query.pkl',
         f'{outputs_path}/intermediateOutputs/splitMatches/first_bases.npy'),
        (all_bases[_cut_a:_cut_b],
         f'{outputs_path}/intermediateOutputs/splitMatches/second_query.pkl',
         f'{outputs_path}/intermediateOutputs/splitMatches/second_bases.npy'),
        (all_bases[_cut_b:],
         f'{outputs_path}/intermediateOutputs/splitMatches/third_query.pkl',
         f'{outputs_path}/intermediateOutputs/splitMatches/third_bases.npy'),
    ]

    # write the query rows belonging to each split; the old index is kept as a
    # column because reset_index is called without drop
    for _bases, _query_path, _bases_path in _split_plan:
        _split_query = query_[np.isin(query_['inchi_base'],_bases)].reset_index()
        _split_query.to_pickle(_query_path)
        del _split_query
    del query_

    # then write the base values that define each split
    for _bases, _query_path, _bases_path in _split_plan:
        np.save(_bases_path,_bases)

    # release everything this cell created so nothing large lingers in the kernel
    del _bases, _query_path, _bases_path
    del _split_plan, _query_ids, _n_bases, _cut_a, _cut_b
    del all_bases


In [None]:
#Similarity methods and transformation parameters below. Leave sim methods as None to run all
reload(datasetBuilder)
reload(tools)

ppm_windows = [10]

noise_threshes=[0.01]
centroid_tolerance_vals = [0.05]
centroid_tolerance_types=['da']
powers=['orig']
sim_methods=['lorentzian','entropy','chi2','fidelity','dot_product','proportional_entropy']
prec_removes=[True]


train_size=3e6
val_size=1e6  # not used in this cell
test_size=2e6

max_matches=None
adduct_match = False

target_=pd.read_pickle(target)

# queryID is the row position for a self search, otherwise the placeholder "*"
if self_search:
    target_.insert(0,'queryID', [i for i in range(len(target_))])
else:
    target_.insert(0,'queryID', ["*" for i in range(len(target_))])


def _build_split(split_name, query_file, size, ppm):
    """Build, save and return the matches for one data split.

    Parameters
    ----------
    split_name : str
        Prefix used in the output file names ('train' or 'test').
    query_file : str
        File name of the split's query pickle inside the splitMatches folder.
    size : number
        Forwarded unchanged to datasetBuilder.create_matches_df.
    ppm : number
        Precursor window; forwarded to create_matches_df and embedded in the
        output file names.

    Returns
    -------
    tuple
        (matches, cleaned, sub) where sub holds the first two columns of
        cleaned renamed to 'query'/'target' plus cleaned's 'match' column.
    """
    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

    # read in this split's queries and shuffle their order
    split_query = pd.read_pickle(f'{split_dir}/{query_file}')
    split_query = split_query.sample(frac=1)

    # create and save the matches for this split
    split_matches = datasetBuilder.create_matches_df(split_query,target_,ppm,max_matches,size, adduct_match)
    split_matches.to_pickle(f'{split_dir}/{split_name}_matches_{ppm}_ppm.pkl')
    del split_query

    split_cleaned = datasetBuilder.create_cleaned_df(
                                        split_matches,
                                        sim_methods,
                                        noise_threshes,
                                        centroid_tolerance_vals,
                                        centroid_tolerance_types,
                                        powers,
                                        prec_removes
    )
    split_cleaned.to_pickle(f'{split_dir}/{split_name}_cleaned_matches_{ppm}_ppm.pkl')

    # .copy() makes sub an independent frame, so renaming columns and adding
    # 'match' does not write onto a slice of split_cleaned
    # (avoids pandas' chained-assignment / SettingWithCopy warning).
    sub = split_cleaned.iloc[:,:2].copy()
    sub.columns=['query','target']
    sub['match']=split_cleaned['match']

    return split_matches, split_cleaned, sub


# NOTE: sub_train / sub_test are overwritten on every iteration, so with more
# than one ppm window only the last window's frames remain in memory (the
# per-window pickles are all written).
for i in ppm_windows:

    # training data comes from the first split of query bases
    matches, cleaned, sub_train = _build_split('train', 'first_query.pkl', train_size, i)

    # test data comes from the third split of query bases
    matches, cleaned, sub_test = _build_split('test', 'third_query.pkl', test_size, i)





Func Specs

In [None]:
reload(func_ob)
reload(TunaSims)


#helper loss / regularization funcs
# (defined with def instead of name-bound lambdas so they carry a real name
# and a docstring)
def squared_loss(x):
    """Return x squared."""
    return x**2


def lin_loss(x):
    """Return x unchanged."""
    return x


def l1_reg(l, x):
    """Return l times the sum of absolute values of x."""
    return l*sum(np.abs(x))


def l2_reg(l, x):
    """Return l times the sum of squares of x."""
    return l*sum(x**2)


def no_reg(x):
    """Return 0 regardless of x (no regularization)."""
    return 0


# Named groups of parameter letters. Not passed to create_all_funcs_stoch below.
params = {'all':[i for i in 'abcdefghijklmno'],
          'tot_dis':['a','b'],
          'ind_dis':['e','f'],
          'tot+ind':['a','b','e','f'],
          'tot+ind_int':['a','b','e','f','m'],
          'tot+ind+len':['a','b','e','f','i','j','m','n','o'],
          'tot_dis_k':['a','b','c','d'],
          'ind_dis_k':['e','f','g','h'],
          'tot+ind_k':['a','b','c','d','e','f','g','h'],
          'tot+ind_int_k':['a','b','c','d','e','f','g','h','i','j','k','l'],
          }

# reg_funcs / reg_names and losses / loss_names are parallel lists
reg_funcs = [partial(l1_reg,1),partial(l1_reg,.1),partial(l2_reg,1),partial(l2_reg,.1),no_reg]
reg_names = ['l1_1','l1_0.1','l2_1','l2_0.1','none']
losses = [squared_loss, lin_loss]
loss_names = ['squared','l1']
momentums = [None,'simple','jonie']
mom_weights = [[0.8,0.2],[0.2,0.8]]
lambdas = [0.01,0.1,1]
max_iters = [1e3,1e4,1e5]

funcs=testUtils.create_all_funcs_stoch(reg_funcs=reg_funcs,
                                       reg_names=reg_names,
                                       losses=losses,
                                       loss_names=loss_names,
                                       momentums=momentums,
                                       mom_weights=mom_weights,
                                       lambdas=lambdas,
                                       max_iters=max_iters)

print(f'number of specifications: {len(funcs)}')

trained = list()
# reshuffle the training rows before fitting
sub_train = sub_train.sample(frac=1)

# fit every specification on the same shuffled frame; print the index as progress
for i, func in enumerate(funcs):

    func.fit(sub_train)
    trained.append(func)
    print(i)

Get Train Errors

In [None]:
# Similarity metrics evaluated alongside the trained functions.
comparison_metrics = ['entropy',
             'proportional_entropy',
             'lorentzian',
             'dot_product',
             'fidelity',
             'proportional_manhattan',
             'max_fidelity',
             'matusita',
             'proportional_lorentzian',
             'chi2',
             'laplacian',
             'max_laplacian',
             'harmonic_mean',
             'bhattacharya_1',
             'squared_chord',
             'cross_ent'
             ]

# Slice bounds must be integers: len(trained)/3 is a float in Python 3 and
# max_iters holds floats (1e3, ...), both of which raise TypeError when used
# as list / iloc slice bounds. Use floor division and int() instead.
first_third = trained[:len(trained)//3]
n_small, n_medium, n_large = (int(m) for m in max_iters)

# NOTE(review): all three evaluations use the same first third of `trained`;
# if the specifications are grouped by max_iters, medium/large probably want
# the second/third thirds -- confirm against create_all_funcs_stoch ordering.
small = testUtils.trained_res_to_df(first_third,sub_train.iloc[:n_small])
small_metrics = testUtils.orig(comparison_metrics,sub_train.iloc[:n_small])

medium = testUtils.trained_res_to_df(first_third,sub_train.iloc[:n_medium])
medium_metrics = testUtils.orig(comparison_metrics,sub_train.iloc[:n_medium])

large = testUtils.trained_res_to_df(first_third,sub_train.iloc[:n_large])
large_metrics = testUtils.orig(comparison_metrics,sub_train.iloc[:n_large])

In [None]:
# Slice bounds must be integers: len(trained)/3 is a float in Python 3 and
# max_iters[2] is the float 1e5, both of which raise TypeError when used as
# list / iloc slice bounds. Use floor division and int() instead.
first_third_test = trained[:len(trained)//3]
test_rows = sub_test.iloc[int(max_iters[2]):]

# NOTE(review): the three evaluations below are identical (same functions,
# same rows), so small/medium/large hold the same results. If the
# specifications are grouped by max_iters, each probably wants a different
# third of `trained` -- confirm before relying on these names.
small_test = testUtils.trained_res_to_df(first_third_test,test_rows)
small_test_metrics = testUtils.orig(comparison_metrics,test_rows)

medium_test = testUtils.trained_res_to_df(first_third_test,test_rows)
medium_test_metrics = testUtils.orig(comparison_metrics,test_rows)

large_test = testUtils.trained_res_to_df(first_third_test,test_rows)
large_test_metrics = testUtils.orig(comparison_metrics,test_rows)

Conclusions: 

The number of parameters does not appear to change training time much.

consider replacing knockouts with sigmoids

consider tuning final sigmoid

Other Ideas:

Accuracy (In order of increasing difficulty):

-Incorporate as feature how many possible chem structures (can also restrict to NPS) exist within a certain precursor distance. (violating golden rules or not)

-include original NIST version or theoretical res as feature

-Weight different ranges of spec differently for matches (more diversity/greater accuracy)

-smush together top n results over different inchicores and come up with combined model predicting over individual inchicores

-diagnostic ion/loss classing as a feature...do they match

-kernelized smooth match

-3d struct guesses...do they match (cores, but can generalize to 3d)

Speed (In order of increasing difficulty):

-combine sim metrics and expand(apply func to df)

-exclude matches based on non-similarity features to cut down on needed comparisons

-ion tables to upper bound similarity

-only use one peak consolidation and matching protocol...then only do reweight transformations on already matched peaks for spec and sim features