In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from importlib import reload
import os
from collections import Counter
# import warnings
# warnings.filterwarnings("ignore")

import TunaSims
import func_ob
import tools
import datasetBuilder
import testUtils
import spectral_similarity

Results for Different Ways of Distributing Interspectral Intensity Difference

In [None]:
# Spread a fixed total intensity difference over i peaks in two ways and score
# each vector against an all-zero vector of the same length:
#   * "descending": weights proportional to 1, 1/2, ..., 1/i
#   * "uniform":    every peak carries total_difference / i
xs=list()
scores_1=list()
scores_2=list()

total_difference = 0.9
len_difference = 10  # not used in this cell
max_len = 25

# NOTE(review): func1 and func2 are built with identical parameters, so the two
# curves differ only through the shape of the difference vector.
func1 = partial(TunaSims.tuna_dif_distance,f=1,g=1,h=2)
func2 = partial(TunaSims.tuna_dif_distance,f=1,g=1,h=2)

normalize = False

for i in range(1,max_len):

    xs.append(i)

    # harmonically decaying weights, rescaled to sum to total_difference
    dif_1 = 1/np.arange(1, i+1)
    dif_1 = dif_1/dif_1.sum()*total_difference

    # the same total split evenly across i peaks
    dif_2 = np.full(i, total_difference/i)

    # NOTE(review): both modes now pass an explicit all-zero reference vector;
    # confirm tuna_dif_distance has no different default for its second argument.
    raw_1 = func1(dif_1, np.zeros(len(dif_1)))
    raw_2 = func2(dif_2, np.zeros(len(dif_2)))

    if normalize:
        scores_1.append(1 - 1/raw_1)
        scores_2.append(1 - 1/raw_2)
    else:
        scores_1.append(raw_1)
        scores_2.append(raw_2)

plt.plot(xs, scores_1, label='descending')
plt.plot(xs, scores_2, label='uniform')

plt.xlabel('number of peaks carrying the difference')
plt.ylabel('score')
plt.legend()
plt.show()

In [None]:
#databases
# All database pickles live in one directory; keep it in a single place so the
# notebook can be pointed elsewhere by editing one line.
raw_db_dir = '/Users/jonahpoczobutt/projects/raw_data/db_csvs'

outputs_path='/Users/jonahpoczobutt/projects/TunaRes/testy'
nist14=f'{raw_db_dir}/nist14_highres.pkl'
nist20_prot_deprot = f'{raw_db_dir}/nist20_prot_deprot.pkl'
nist23_hr_prot_deprot_only = f'{raw_db_dir}/nist23_prot_deprot_only.pkl'
nist23_hr_full =f'{raw_db_dir}/nist23_full.pkl'
gnps=f'{raw_db_dir}/gnps_highres.pkl'
mona=f'{raw_db_dir}/mona_highres.pkl'
metlin=f'{raw_db_dir}/metlin_highres_inst.pkl'
mona_nist = f'{raw_db_dir}/mona_nist_prot_only.pkl'

# Which database is searched against which; a self search uses one database
# for both roles.
self_search=False
query = metlin
target = nist23_hr_full
if self_search:
    target=query

fullRun=True
if fullRun:
    # exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
    # as soon as the output tree was already present.
    os.makedirs(f'{outputs_path}/intermediateOutputs/splitMatches', exist_ok=True)

In [None]:
# Split the query database into three disjoint parts by 'inchi_base'
# (50% / 20% / 30% of the distinct bases) and persist each part together
# with the list of bases it contains.
fullRun=False
if fullRun:

    #This should be replaced with a function to read in all the databases
    query_ = pd.read_pickle(query)
    all_bases = list(set(query_['inchi_base']))

    if self_search:
        query_.insert(0,'queryID', list(range(len(query_))))
    else:
        query_.insert(0,'queryID', ["_"] * len(query_))

    #this method is in place
    # NOTE(review): the shuffle is unseeded, so the split differs between runs.
    np.random.shuffle(all_bases)

    first_cut = int(len(all_bases)*0.5)
    second_cut = int(len(all_bases)*0.7)
    split_bases = {
        'first': all_bases[:first_cut],
        'second': all_bases[first_cut:second_cut],
        'third': all_bases[second_cut:],
    }

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'
    for split_name, bases in split_bases.items():
        # reset_index() returns a new frame instead of mutating a filtered
        # slice in place; the old index is kept as a column, as before.
        split_query_ = query_[np.isin(query_['inchi_base'], bases)].reset_index()
        split_query_.to_pickle(f'{split_dir}/{split_name}_query.pkl')
        np.save(f'{split_dir}/{split_name}_bases.npy', bases)
        del split_query_

    # release the large intermediates
    del query_, all_bases, split_bases, split_name, bases


In [None]:
#Similarity methods and transformation parameters below. Leave sim methods as None to run all
reload(datasetBuilder)
reload(tools)

ppm_windows = [10]

noise_threshes=[0.01]
centroid_tolerance_vals = [0.05]
centroid_tolerance_types=['da']
powers=['orig']
sim_methods=['lorentzian','entropy','chi2','fidelity','dot_product','proportional_entropy']
prec_removes=[True]
build_dataset=False


train_size=3e6
val_size=1e6
test_size=2e6

max_matches=None
adduct_match = False

# NOTE(review): target_ is only consumed when build_dataset is True, but it is
# read unconditionally here.
target_=pd.read_pickle(target)

if self_search:
    target_.insert(0,'queryID', list(range(len(target_))))
else:
    target_.insert(0,'queryID', ["*"] * len(target_))


def _build_cleaned_split(query_pkl, split_name, n_matches, ppm):
    """Create, clean and persist the matches of one split for one ppm window.

    Reads and shuffles the query pickle, builds matches against ``target_``
    with ``datasetBuilder.create_matches_df``, cleans them with
    ``datasetBuilder.create_cleaned_df`` and pickles both frames under
    ``splitMatches`` as ``{split_name}_matches_{ppm}_ppm.pkl`` and
    ``{split_name}_cleaned_matches_{ppm}_ppm.pkl``.

    Returns a frame with the first two cleaned columns renamed to
    'query' / 'target' plus the 'match' column.
    """
    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

    # read in the split's queries and shuffle their order
    shuffled_query = pd.read_pickle(query_pkl).sample(frac=1)

    matches = datasetBuilder.create_matches_df(shuffled_query,target_,ppm,max_matches,n_matches, adduct_match)
    matches.to_pickle(f'{split_dir}/{split_name}_matches_{ppm}_ppm.pkl')
    del shuffled_query

    cleaned = datasetBuilder.create_cleaned_df(
        matches,
        sim_methods,
        noise_threshes,
        centroid_tolerance_vals,
        centroid_tolerance_types,
        powers,
        prec_removes
    )
    cleaned.to_pickle(f'{split_dir}/{split_name}_cleaned_matches_{ppm}_ppm.pkl')

    # .copy() so that adding 'match' writes to an independent frame rather
    # than to a slice of `cleaned` (avoids SettingWithCopyWarning)
    sub = cleaned.iloc[:,:2].copy()
    sub.columns=['query','target']
    sub['match']=cleaned['match']
    return sub


for ppm in ppm_windows:

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

    if build_dataset:
        # training matches come from the first split, test matches from the third
        sub_train = _build_cleaned_split(f'{split_dir}/first_query.pkl', 'train', train_size, ppm)
        sub_test = _build_cleaned_split(f'{split_dir}/third_query.pkl', 'test', test_size, ppm)

    else:
        # reuse previously built match sets
        sub_train = pd.read_pickle(f'{split_dir}/train_cleaned_matches_{ppm}_ppm.pkl')
        sub_train.columns=['query','target','match']

        sub_test = pd.read_pickle(f'{split_dir}/test_cleaned_matches_{ppm}_ppm.pkl')
        sub_test.columns=['query','target','match']

Func Specs

In [20]:
# Parameter specifications. Every entry maps a spec name to a tuple of
#   (list of parameter names, bounds dict or None)
# where a bounds dict maps a parameter name to a (lower, upper) tuple.

def _conv_bounds(ge_zero, ge_one):
    """Bounds dict constraining `ge_zero` to (0, inf) and `ge_one` to (1, inf)."""
    return {ge_zero: (0, np.inf), ge_one: (1, np.inf)}


def _full_family(prefix, p0, p1, p2, p3, p4):
    """Seven specs for one prefix, in the order
    lin, aff, affk, quad, quad_conv, quadk_pconv, quadk."""
    return {
        f'{prefix}_lin': ([p0], None),
        f'{prefix}_aff': ([p0, p1], None),
        f'{prefix}_affk': ([p0, p1, p3, p4], None),
        f'{prefix}_quad': ([p0, p1, p2], None),
        f'{prefix}_quad_conv': ([p0, p1, p2], _conv_bounds(p0, p2)),
        f'{prefix}_quadk_pconv': ([p0, p1, p2, p3, p4], _conv_bounds(p0, p2)),
        f'{prefix}_quadk': ([p0, p1, p2, p3, p4], None),
    }


inds = {
    # 'ind_lin': ('f h'.split(), None),
    # 'ind_lin_conv': ('f h'.split(), _conv_bounds('f', 'h')),
    'ind_quad': ('f g h'.split(), None),
    'ind_quad_conv': ('f g h'.split(), _conv_bounds('f', 'h')),
    # 'ind_quadk_pconv': ('f g h i j'.split(), _conv_bounds('f', 'h')),
    'ind_quadk': ('f g h i j'.split(), None),
}

lens = _full_family('len', *'k l m n o'.split())

basics = {
    # 'tot_lin': (['a'], None),
    # 'tot_aff': (['a','b'], None),
    # 'tot_affk': (['a','b','e','f'], None),
    'tot_quad': ('a b c'.split(), None),
    # 'tot_quad_conv': (['a','b','c'], _conv_bounds('a', 'c')),
    # 'tot_quadk_pconv': (['a','b','c','d','e'], _conv_bounds('a', 'c')),
    'tot_quadk': ('a b c d e'.split(), None),
    # 'len_lin': (['k'], None),
    # 'len_aff': (['k','l'], None),
    # 'len_affk': (['k','l','n','o'], None),
    'len_quad': ('k l m'.split(), None),
    # 'len_quad_conv': (['k','l','m'], _conv_bounds('k', 'm')),
    # 'len_quadk_pconv': (['k','l','m','n','o'], _conv_bounds('k', 'm')),
    'len_quadk': ('k l m n o'.split(), None),
}

ints = {
    **_full_family('int1', *'p q r s t'.split()),
    **_full_family('int2', *'u v w x y'.split()),
    **_full_family('int3', *'z a_ b_ c_ d_'.split()),
}

For now, only test all combos of 2 feature types

In [21]:
# Pair every spec in `inds` with every spec in `basics`. The combined entry
# concatenates the two parameter-name lists and merges the two bounds dicts
# with testUtils.dict_combine.
params = dict()
for ind_name, (ind_args, ind_bounds) in inds.items():
    for basic_name, (basic_args, basic_bounds) in basics.items():
        params[f'{ind_name}_{basic_name}'] = (
            ind_args + basic_args,
            testUtils.dict_combine(ind_bounds, basic_bounds),
        )

# stand-alone spec that is not produced by the pairing above
params['ind_lin']=(['f','h'],None)

In [23]:
reload(func_ob)
reload(TunaSims)
reload(testUtils)


#helper funcs
def squared_loss(x):
    """Squared distance of x from 1."""
    return (1-x)**2


def lin_loss(x):
    """Absolute distance of x from 1."""
    return abs(1-x)


def l1_reg(l, x):
    """l times the sum of absolute values of x."""
    return l*np.sum(np.abs(x))


def l2_reg(l, x):
    """l times the Euclidean norm of x (square root of the sum of squares)."""
    return l*np.sqrt(np.sum(x**2))


def no_reg(x):
    """No regularization: always 0."""
    return 0


# grid of training options handed to testUtils.create_all_funcs_stoch
reg_funcs = [partial(l2_reg,.1),no_reg]
reg_names = ['l2_0.1','none']
losses = [squared_loss]
loss_names = ['squared']
momentums = ['none','simple','jonie']
mom_weights = [[0.8,0.2],[0.2,0.8]]
lambdas = [0.01]
# float literals: the generated spec names end in e.g. '_10.0'
max_iters = [1e1,1e2,1e3,1e4,1e5,1e6,5e6]

funcs=testUtils.create_all_funcs_stoch(reg_funcs=reg_funcs,
                                       reg_names=reg_names,
                                       losses=losses,
                                       loss_names=loss_names,
                                       momentums=momentums,
                                       params=params,
                                       mom_weights=mom_weights,
                                       lambdas=lambdas,
                                       max_iters=max_iters)

print(f'number of specifications: {len(funcs)}')

number of specifications: 910


In [24]:
# Shuffle the training rows, then fit every specification in order, printing
# the index of each one once it has been fitted.
sub_train = sub_train.sample(frac=1)
trained = []

for i, func in enumerate(funcs):
    func.fit(sub_train)
    trained.append(func)
    print(i)

0
1


  return 1/(1 + np.exp(-z))


2
3
4
5
6
7
8
9
10


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82


KeyboardInterrupt: 

Get Train Errors

In [25]:
reload(testUtils)
reload(spectral_similarity)

#trained = trained[:12]+trained[16:]

# existing similarity metrics used as the comparison baseline
comparison_metrics = [
    'entropy',
    'manhattan',
    'lorentzian',
    'dot_product',
    'fidelity',
    'proportional_manhattan',
    'max_fidelity',
    'matusita',
    'proportional_lorentzian',
    'chi2',
    'laplacian',
    'max_laplacian',
    'harmonic_mean',
    'bhattacharya_1',
    'squared_chord',
    'cross_ent',
]

# evaluate on the first rows of the (shuffled) training frame only
n_train_eval = 100000
small = testUtils.trained_res_to_df(trained, sub_train.iloc[:n_train_eval])
small_metrics = testUtils.orig_metric_to_df(comparison_metrics, sub_train.iloc[:n_train_eval])

# medium = testUtils.trained_res_to_df(trained,sub_train.iloc[:int(max_iters[1])])
# medium_metrics = testUtils.orig_metric_to_df(comparison_metrics,sub_train.iloc[:int(max_iters[1])])

# large = testUtils.trained_res_to_df(trained,sub_train.iloc[:int(max_iters[2])])
# large_metrics = testUtils.orig_metric_to_df(comparison_metrics,sub_train.iloc[:int(max_iters[2])])

0: ind_quad_tot_quad_l2_0.1_squared_none_0.01_10.0
1: ind_quad_tot_quad_l2_0.1_squared_none_0.01_100.0
2: ind_quad_tot_quad_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


3: ind_quad_tot_quad_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


4: ind_quad_tot_quad_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


5: ind_quad_tot_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


6: ind_quad_tot_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


7: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_10.0
8: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_100.0
9: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


10: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


11: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


12: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


13: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


14: ind_quad_len_quad_l2_0.1_squared_none_0.01_10.0
15: ind_quad_len_quad_l2_0.1_squared_none_0.01_100.0
16: ind_quad_len_quad_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


17: ind_quad_len_quad_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


18: ind_quad_len_quad_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


19: ind_quad_len_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


20: ind_quad_len_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


21: ind_quad_len_quadk_l2_0.1_squared_none_0.01_10.0
22: ind_quad_len_quadk_l2_0.1_squared_none_0.01_100.0
23: ind_quad_len_quadk_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


24: ind_quad_len_quadk_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


25: ind_quad_len_quadk_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


26: ind_quad_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


27: ind_quad_len_quadk_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


28: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_10.0
29: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_100.0
30: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_1000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


31: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_10000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


32: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_100000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


33: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_1000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


34: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_5000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


35: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_10.0
36: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_100.0
37: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_1000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


38: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_10000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


39: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_100000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


40: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_1000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


41: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_5000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)
  return 1/(1 + np.exp(-z))


42: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_10.0
43: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_100.0
44: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_1000.0
45: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_10000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


46: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_100000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


47: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_1000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


48: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_5000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


49: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_10.0
50: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_100.0
51: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_1000.0
52: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_10000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


53: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_100000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


54: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


55: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_5000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


56: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_10.0
57: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_100.0
58: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_1000.0
59: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_10000.0
60: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_100000.0
61: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_1000000.0
62: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_5000000.0
63: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_10.0
64: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_100.0
65: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_1000.0
66: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_10000.0
67: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_100000.0
68: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_1000000.0
69: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_5000000.0
70: ind_quadk_len_quad_l2_0.1_squared_none_0.01_10.0
71: ind_quadk_len_quad_l2_0.1_squared_none_0.01_100.0
72: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1000.0
73: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1

  return 1/(1 + np.exp(-z))


75: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


76: ind_quadk_len_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


77: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_10.0
78: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_100.0
79: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_1000.0
80: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_10000.0
81: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_100000.0
82: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
  score = np.power(np.sum(q * p), 2) / (


In [26]:
# Same evaluation as for the training rows, on the first rows of the test frame.
n_test_eval = 100000
small_test = testUtils.trained_res_to_df(trained, sub_test.iloc[:n_test_eval])
small_test_metrics = testUtils.orig_metric_to_df(comparison_metrics, sub_test.iloc[:n_test_eval])

# medium_test = testUtils.trained_res_to_df(trainy[mids],sub_test.iloc[int(max_iters[2]):200000])
# medium_test_metrics = testUtils.orig_metric_to_df(comparison_metrics,sub_test.iloc[int(max_iters[2]):200000])

# large_test = testUtils.trained_res_to_df(trainy[bigs],sub_test.iloc[int(max_iters[2]):200000])
# large_test_metrics = testUtils.orig_metric_to_df(comparison_metrics,sub_test.iloc[int(max_iters[2]):200000])

0: ind_quad_tot_quad_l2_0.1_squared_none_0.01_10.0
1: ind_quad_tot_quad_l2_0.1_squared_none_0.01_100.0
2: ind_quad_tot_quad_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


3: ind_quad_tot_quad_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


4: ind_quad_tot_quad_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


5: ind_quad_tot_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


6: ind_quad_tot_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


7: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_10.0
8: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_100.0
9: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


10: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


11: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


12: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


13: ind_quad_tot_quadk_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


14: ind_quad_len_quad_l2_0.1_squared_none_0.01_10.0
15: ind_quad_len_quad_l2_0.1_squared_none_0.01_100.0
16: ind_quad_len_quad_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


17: ind_quad_len_quad_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


18: ind_quad_len_quad_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


19: ind_quad_len_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


20: ind_quad_len_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


21: ind_quad_len_quadk_l2_0.1_squared_none_0.01_10.0
22: ind_quad_len_quadk_l2_0.1_squared_none_0.01_100.0
23: ind_quad_len_quadk_l2_0.1_squared_none_0.01_1000.0


  return 1/(1 + np.exp(-z))


24: ind_quad_len_quadk_l2_0.1_squared_none_0.01_10000.0


  return 1/(1 + np.exp(-z))


25: ind_quad_len_quadk_l2_0.1_squared_none_0.01_100000.0


  return 1/(1 + np.exp(-z))


26: ind_quad_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


27: ind_quad_len_quadk_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


28: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_10.0
29: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_100.0
30: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_1000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)
  return 1/(1 + np.exp(-z))
  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


31: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_10000.0
32: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_100000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


33: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_1000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


34: ind_quad_conv_tot_quad_l2_0.1_squared_none_0.01_5000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


35: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_10.0
36: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_100.0
37: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_1000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


38: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_10000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


39: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_100000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


40: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_1000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)


41: ind_quad_conv_tot_quadk_l2_0.1_squared_none_0.01_5000000.0


  terms[0] += middle(a*(total_disagreement+b)**c,d,e)
  return 1/(1 + np.exp(-z))


42: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_10.0
43: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_100.0
44: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_1000.0
45: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_10000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


46: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_100000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


47: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_1000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


48: ind_quad_conv_len_quad_l2_0.1_squared_none_0.01_5000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


49: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_10.0
50: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_100.0
51: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_1000.0
52: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_10000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


53: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_100000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)


54: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


55: ind_quad_conv_len_quadk_l2_0.1_squared_none_0.01_5000000.0


  terms[2] += middle(k*(disagreement_length+l)**m,n,o)
  return 1/(1 + np.exp(-z))


56: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_10.0
57: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_100.0
58: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_1000.0
59: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_10000.0
60: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_100000.0
61: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_1000000.0
62: ind_quadk_tot_quad_l2_0.1_squared_none_0.01_5000000.0
63: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_10.0
64: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_100.0
65: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_1000.0
66: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_10000.0
67: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_100000.0
68: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_1000000.0
69: ind_quadk_tot_quadk_l2_0.1_squared_none_0.01_5000000.0
70: ind_quadk_len_quad_l2_0.1_squared_none_0.01_10.0
71: ind_quadk_len_quad_l2_0.1_squared_none_0.01_100.0
72: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1000.0
73: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1

  return 1/(1 + np.exp(-z))


75: ind_quadk_len_quad_l2_0.1_squared_none_0.01_1000000.0


  return 1/(1 + np.exp(-z))


76: ind_quadk_len_quad_l2_0.1_squared_none_0.01_5000000.0


  return 1/(1 + np.exp(-z))


77: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_10.0
78: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_100.0
79: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_1000.0
80: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_10000.0
81: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_100000.0
82: ind_quadk_len_quadk_l2_0.1_squared_none_0.01_1000000.0


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
  score = np.power(np.sum(q * p), 2) / (


In [None]:
# Difference between the sigmoid of the first trained value of spec 1 at
# full scale and at half scale.
_first_val = trained[1].trained_vals[0]
TunaSims.sigmoid(_first_val*1) - TunaSims.sigmoid(_first_val*0.5)

In [27]:
# Rank train and test results by AUC, best first. Rebinding the names instead
# of sorting in place keeps the cell free of hidden mutation.
small = small.sort_values(by='auc', ascending=False)
small_test = small_test.sort_values(by='auc', ascending=False)

In [32]:
# first 20 rows of the training results
small.head(20)

Unnamed: 0,name,reg,alpha,loss_func,momentum,weights,lambdas,max_iter,auc
58,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,1000.0,0.01,1000.0,0.812493
59,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,10000.0,0.01,10000.0,0.812481
60,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,100000.0,0.01,100000.0,0.812416
62,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,5000000.0,0.01,5000000.0,0.812287
29,ind_quad_conv_tot_quad_l2,0.1,squared,none,0.01,100.0,0.01,100.0,0.812172
57,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,100.0,0.01,100.0,0.812169
61,ind_quadk_tot_quad_l2,0.1,squared,none,0.01,1000000.0,0.01,1000000.0,0.812019
7,ind_quad_tot_quadk_l2,0.1,squared,none,0.01,10.0,0.01,10.0,0.623431
21,ind_quad_len_quadk_l2,0.1,squared,none,0.01,10.0,0.01,10.0,0.623427
0,ind_quad_tot_quad_l2,0.1,squared,none,0.01,10.0,0.01,10.0,0.623068


In [31]:
# distinct values of the 'name' column in the test results
set(small_test.loc[:, 'name'])

{'ind_quad_conv_len_quad_l2',
 'ind_quad_conv_len_quadk_l2',
 'ind_quad_conv_tot_quad_l2',
 'ind_quad_conv_tot_quadk_l2',
 'ind_quad_len_quad_l2',
 'ind_quad_len_quadk_l2',
 'ind_quad_tot_quad_l2',
 'ind_quad_tot_quadk_l2',
 'ind_quadk_len_quad_l2',
 'ind_quadk_len_quadk_l2',
 'ind_quadk_tot_quad_l2',
 'ind_quadk_tot_quadk_l2'}

In [51]:
# scratch check: element-wise product of two small arrays
np.multiply(np.array([1, 2]), np.array([1, 3]))  # * (np.array([1,2])/np.array([1,3]))

array([1, 6])

In [30]:
# Display the comparison-metric table computed on the test rows.
small_test_metrics

Unnamed: 0,metric,AUC
0,entropy,0.791632
1,manhattan,0.798584
2,lorentzian,0.801264
3,dot_product,0.795368
4,fidelity,0.792377
5,proportional_manhattan,0.79297
6,max_fidelity,0.789883
7,matusita,0.800813
8,proportional_lorentzian,0.791576
9,chi2,0.803755


In [None]:
# Rank the test results by AUC (best first) without mutating in place, then
# show the first 20 rows.
small_test = small_test.sort_values(by='auc', ascending=False)
small_test.head(20)

In [None]:
# Display the comparison-metric table computed on the test rows.
small_test_metrics

In [None]:
# Mean AUC per (name, momentum) for rows with reg == 'l2',
# loss_func == 'squared' and alpha == '0.1'.
# NOTE(review): in the displayed `small` table the `reg` column holds values
# like 0.1 and `alpha` holds 'squared' (the 'l2' part stays inside `name`), so
# this filter may select nothing -- verify how testUtils.trained_res_to_df
# splits the spec name into columns.
_l2_squared_rows = (
    (small_test['reg'] == 'l2')
    & (small_test['loss_func'] == 'squared')
    & (small_test['alpha'] == '0.1')
)
large_test_ = small_test.loc[_l2_squared_rows, ['name', 'momentum', 'auc']]
large_test_.groupby(['name', 'momentum']).mean()

In [None]:
# `large_test` is only assigned in commented-out code, so referencing it fails
# on a fresh kernel; rank the filtered frame `large_test_` instead.
# NOTE(review): confirm `large_test_` is the frame that was meant here.
large_test_.sort_values(by='auc',ascending=False)

Conclusions: 

add offsets for terms

num of params not appearing to change train time much

consider replacing knockouts with sigmoids

consider tuning final sigmoid

Should features like length and entropy be included in the similarity, or be used outside it as extra features in the learned model? Both? Neither?


Other Ideas:

Accuracy (In order of increasing difficulty):

-Incorporate as a feature how many possible chem structures (can also restrict to NPS) exist within a certain precursor distance (violating golden rules or not).

-include original NIST version or theoretical res as feature

-Weight different ranges of spec differently for matches (more diversity/greater accuracy)

-smush together top n results over different inchicores and come up with combined model predicting over individual inchicores

-diagnostic ion/loss classing as a feature...do they match

-kernelized smooth match

-3d struct guesses...do they match (cores, but can generalize to 3d)

Speed(In order of increasing difficulty):

-combine sim metrics and expand(apply func to df)

-exclude matches based on non-similarity features to cut down on needed comparisons

-ion tables to upper bound similarity

-only use one peak consolidation and matching protocol...then only do reweight transformations on already matched peaks for spec and sim features

-can missing peaks in lower energy be explained by frags and losses from higher energy? incorporate into model

Order to proceed:

-recreate databases with coll energy included (standardized format across DBs)

-what proportion of matches are the same coll energy?

-quantify variability in peak appearance vs peak intensity across collision energies
    -does this relate in a predictable way to fragment mass?

-test sim metrics for same coll energy vs. different coll energy (is the same inductive bias useful?)

-Show that regular funcs are in the space of combo distance

-test combining individual metrics that use different components of the 2 vectors (add, mult, dif)

-range over individual metrics in combined score in attempt to explain why combining them is successful

-train combo metrics with flattened components and individual (should these sims be broken out?)
    -should we do this for same coll energy vs. different coll energies?

-are different combo metrics put into larger model more successful than the combined individual metrics

-can tunasims be fit with nonlinearities between the components (flattened or not?)