In [None]:
import copy
import os
import pickle
import warnings
from collections import Counter
from functools import partial
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
from sklearn.metrics import roc_auc_score as auc

warnings.filterwarnings("ignore")

import TunaSims
import datasetBuilder
import func_ob
import spectral_similarity
import testUtils
import tools

Results for Different Ways of Distributing Interspectral Intensity Difference

In [None]:
# Compare how the score reacts to the way a fixed total intensity difference
# is spread over an increasing number of entries: once with descending
# (harmonic) weights and once spread uniformly.
xs=list()
scores_1=list()
scores_2=list()

total_difference = 0.9
len_difference = 10
max_len = 25

# Both partials currently use identical parameters (f=1, g=1, h=2), so the two
# curves differ only in the difference vectors that are fed in.
func1 = partial(TunaSims.tuna_dif_distance,f=1,g=1,h=2)
func2 = partial(TunaSims.tuna_dif_distance,f=1,g=1,h=2)

normalize = False

for i in range(1,max_len):

    xs.append(i)

    # descending profile 1, 1/2, ..., 1/i rescaled to sum to total_difference
    dif_1 = np.array([1/(x+1) for x in range(i)])
    dif_1 = dif_1/sum(dif_1)*total_difference

    # uniform profile: total_difference split evenly over i entries
    dif_2 = np.array([total_difference/i for x in range(i)])

    if normalize:
        # NOTE(review): this branch calls the functions with a single argument,
        # while the branch below passes an explicit all-zero second vector.
        # Confirm the expected signature before switching normalize on.
        scores_1.append(1- 1/func1(dif_1))
        scores_2.append(1 - 1/func2(dif_2))
    else:
        scores_1.append(func1(dif_1, np.zeros(len(dif_1))))
        scores_2.append(func2(dif_2, np.zeros(len(dif_2))))

plt.plot(xs, scores_1, label='descending')
# label typo fixed ('unfiorm' -> 'uniform')
plt.plot(xs, scores_2, label='uniform')

plt.legend()
plt.show()

In [None]:
#databases
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
outputs_path='/Users/jonahpoczobutt/projects/TunaRes/testy'
nist14='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist14_highres.pkl'
nist20_prot_deprot = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist20_prot_deprot.pkl'
nist23_hr_prot_deprot_only = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_prot_deprot_only.pkl'
nist23_hr_full ='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_full.pkl'
gnps='/Users/jonahpoczobutt/projects/raw_data/db_csvs/gnps_highres.pkl'
mona='/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_highres.pkl'
metlin='/Users/jonahpoczobutt/projects/raw_data/db_csvs/metlin_highres_inst.pkl'
mona_nist = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_nist_prot_only.pkl'

# query/target selection; a self search compares the query database to itself
self_search=False
query = metlin
target = nist23_hr_full
if self_search:
    target=query

fullRun=True
if fullRun:
    # makedirs creates missing parent directories and, with exist_ok=True,
    # does not raise when the cell is re-run against an existing output tree
    # (os.mkdir raised FileExistsError on every second run).
    # Files already present in these directories are left in place and will be
    # overwritten by later cells that write to the same names.
    os.makedirs(f'{outputs_path}/intermediateOutputs/splitMatches', exist_ok=True)
    os.makedirs(f'{outputs_path}/intermediateOutputs/datasets', exist_ok=True)

In [None]:
fullRun=True
if fullRun:

    # TODO (from the original author): replace with a function that reads in all the databases
    query_ = pd.read_pickle(query)
    all_bases = list(set(query_['inchi_base']))

    # queryID is a running integer for a self search, otherwise the placeholder "_"
    if self_search:
        query_ids = list(range(len(query_)))
    else:
        query_ids = ["_"] * len(query_)
    query_.insert(0,'queryID', query_ids)

    # shuffles the list in place
    np.random.shuffle(all_bases)

    # partition the shuffled bases 50% / 20% / 30%
    n_bases = len(all_bases)
    cut_first = int(n_bases*0.5)
    cut_second = int(n_bases*0.7)
    split_bases = {
        'first': all_bases[:cut_first],
        'second': all_bases[cut_first:cut_second],
        'third': all_bases[cut_second:],
    }

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

    # write the query rows belonging to each group of bases; only one split
    # frame is held in memory at a time
    for split_name, bases in split_bases.items():
        split_query = query_[np.isin(query_['inchi_base'], bases)].reset_index()
        split_query.to_pickle(f'{split_dir}/{split_name}_query.pkl')
        del(split_query)
    del(query_)

    # persist the base lists themselves alongside the split queries
    for split_name, bases in split_bases.items():
        np.save(f'{split_dir}/{split_name}_bases.npy', bases)

    del(split_name, bases)
    del(split_bases)
    del(all_bases)


In [None]:
#Similarity methods and transformation parameters below. Leave sim methods as None to run all
reload(datasetBuilder)
reload(tools)

comparison_metrics = ['entropy',
             'manhattan',
             'lorentzian',
             'dot_product',
             'fidelity',
             'matusita',
             'chi2',
             'laplacian',
             'harmonic_mean',
             'bhattacharya_1',
             'squared_chord',
             'cross_ent'
    ]

ppm_windows = [10]
noise_threshes=[0.01,0.0]
centroid_tolerance_vals = [0.05]
centroid_tolerance_types=['da']
powers=['orig',1]
sim_methods=comparison_metrics
prec_removes=[True]
build_dataset=True


train_size=3e6
# the original assigned 1e6 and immediately overwrote it with 2e6; only the
# effective value is kept. It is used for both the val and the test split.
test_size=2e6

max_matches=None
adduct_match = False

target_=pd.read_pickle(target)

if self_search:
    target_.insert(0,'queryID', [i for i in range(len(target_))])
else:
    target_.insert(0,'queryID', ["*" for i in range(len(target_))])


def _build_split(split_name, query_file, n_matches, ppm):
    """Create, clean and pickle the match data for one split.

    Reads the split's query frame, shuffles it, builds the matches against the
    module-level ``target_`` and writes them to disk. The matches are then
    divided into rows with ``ceratio == 1`` and rows with ``ceratio != 1``,
    and each part is cleaned with the module-level transformation settings and
    pickled.

    Parameters
    ----------
    split_name : str
        Prefix used in the output file names ('train', 'val' or 'test').
    query_file : str
        File name of the pickled query split inside the splitMatches folder.
    n_matches : number
        Size argument forwarded to ``datasetBuilder.create_matches_df``.
    ppm : number
        Precursor window forwarded to ``datasetBuilder.create_matches_df`` and
        embedded in the output file names.

    Returns
    -------
    tuple
        ``(cleaned_same_ce, cleaned_dif_ce)``.
    """
    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

    # read in the split's query rows and shuffle their order
    query_split = pd.read_pickle(f'{split_dir}/{query_file}')
    query_split = query_split.sample(frac=1)

    matches = datasetBuilder.create_matches_df(query_split,target_,ppm,max_matches,n_matches, adduct_match)
    matches.to_pickle(f'{split_dir}/{split_name}_matches_{ppm}_ppm.pkl')
    del(query_split)

    ce_parts = {
        'same_ce': matches[matches['ceratio']==1],
        'dif_ce': matches[matches['ceratio']!=1],
    }

    cleaned = dict()
    for ce_name, ce_matches in ce_parts.items():
        cleaned[ce_name] = datasetBuilder.create_cleaned_df(
                                            ce_matches,
                                            sim_methods,
                                            noise_threshes,
                                            centroid_tolerance_vals,
                                            centroid_tolerance_types,
                                            powers,
                                            prec_removes
        )

    for ce_name, cleaned_frame in cleaned.items():
        cleaned_frame.to_pickle(f'{split_dir}/{split_name}_cleaned_matches_{ce_name}_{ppm}_ppm.pkl')

    return cleaned['same_ce'], cleaned['dif_ce']


def _load_cleaned(split_name, ce_name, ppm):
    """Read a previously pickled cleaned-match frame for one split and CE group."""
    return pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/{split_name}_cleaned_matches_{ce_name}_{ppm}_ppm.pkl')


# NOTE: the sub_* frames are rebound on every pass, so with several ppm
# windows only the frames of the last window stay in memory (all are on disk).
for i in ppm_windows:

    if build_dataset:

        # first bases -> train
        sub_train_same_ce, sub_train_dif_ce = _build_split('train', 'first_query.pkl', train_size, i)

        # second bases -> val. The original stored the shuffled frame under the
        # typo name `query_query_val`, so the validation queries were passed on
        # unshuffled; the helper shuffles every split the same way.
        sub_val_same_ce, sub_val_dif_ce = _build_split('val', 'second_query.pkl', test_size, i)

        # third bases -> test
        sub_test_same_ce, sub_test_dif_ce = _build_split('test', 'third_query.pkl', test_size, i)

    else:
        sub_train_same_ce = _load_cleaned('train', 'same_ce', i)
        sub_val_same_ce = _load_cleaned('val', 'same_ce', i)
        sub_test_same_ce = _load_cleaned('test', 'same_ce', i)

        sub_train_dif_ce = _load_cleaned('train', 'dif_ce', i)
        sub_val_dif_ce = _load_cleaned('val', 'dif_ce', i)
        sub_test_dif_ce = _load_cleaned('test', 'dif_ce', i)


Create Train/Val/Test Data & get Individual Results

In [None]:
reload(testUtils)

train_datasets = [sub_train_same_ce, sub_train_dif_ce]
val_datasets = [sub_val_same_ce, sub_val_dif_ce]
test_datasets = [sub_test_same_ce, sub_test_dif_ce]
dataset_names = ['same_ce','dif_ce']
# train_datasets = [pd.concat((sub_train_same_ce, sub_train_dif_ce)).sample(frac=1)]
# dataset_names=['combined']


def _build_gbc_datasets(datasets, split_name, auc_table):
    """Turn cleaned match frames into per-metric feature frames for one split.

    Each frame in ``datasets`` is read as consecutive (query, target) column
    pairs, one pair per transformation setting. Every pair is passed to
    ``testUtils.orig_metric_to_df`` together with the 'match' labels. The
    feature frames it returns are concatenated column-wise, and its 'AUC'
    values are stored in ``auc_table``.

    Parameters
    ----------
    datasets : list of DataFrame
        Position 0 is reported under the 'same' prefix, every other position
        under 'dif'.
    split_name : str
        'train', 'val' or 'test'; used in the AUC column names and in the
        pickle file name.
    auc_table : DataFrame
        Modified in place: gains a '<same|dif>_<split_name>' column per frame.

    Side effects
    ------------
    Pickles the list of feature frames to
    ``intermediateOutputs/datasets/gbc_<split_name>.pkl`` and prints a progress
    line. The frames are not returned, so they are not kept in memory.
    """
    gbc_datasets = list()
    for position, frame in enumerate(datasets):

        ce_prefix = 'same' if position == 0 else 'dif'
        match_labels = frame['match'].tolist()

        auc_parts = list()
        feature_parts = list()
        for j in range(int(frame.shape[1]/2)):

            # copy so that adding the label column never writes into a slice
            # of the source frame
            sub = frame.iloc[:,2*j:2*(j+1)].copy()
            sub.columns=['query','target']
            sub['match'] = match_labels

            ind_aucs, inds = testUtils.orig_metric_to_df(comparison_metrics, sub)
            auc_parts.append(ind_aucs)
            feature_parts.append(inds)

        # concatenate once, not inside the loop
        auc_table[f'{ce_prefix}_{split_name}'] = pd.concat(auc_parts)['AUC'].tolist()

        features = pd.concat(feature_parts, axis=1)
        features['match'] = match_labels
        gbc_datasets.append(features)

    with open(f'{outputs_path}/intermediateOutputs/datasets/gbc_{split_name}.pkl', 'wb') as handle:

        pickle.dump(gbc_datasets, handle)

    print(f'created {split_name} data')


# Row labels for the individual-metric AUC table.
# The original indexed train_datasets with `_` before any loop had bound it,
# and built the labels metric-major while the AUC lists are collected
# setting-major (outer loop over j); both are corrected here.
# Assumes orig_metric_to_df returns one AUC row per entry of comparison_metrics,
# in that order -- TODO confirm against testUtils.
n_settings = int(train_datasets[0].shape[1]/2)
ind_aucs_full = pd.DataFrame(
    [f'{metric}_{j}' for j in range(n_settings) for metric in comparison_metrics],
    columns=['metric'],
)

_build_gbc_datasets(train_datasets, 'train', ind_aucs_full)
_build_gbc_datasets(val_datasets, 'val', ind_aucs_full)
_build_gbc_datasets(test_datasets, 'test', ind_aucs_full)

Create indices to pull for each metric and those with same components, specify GBC models

In [None]:
# candidate gradient-boosting configurations; each one is trained on every
# feature subset defined below
models = [
                hgbc(),
                hgbc(learning_rate=0.5),
                hgbc(max_iter=200),
                hgbc(learning_rate=0.01,min_samples_leaf=10),
                hgbc(max_iter=200,min_samples_leaf=10),
                hgbc(learning_rate=0.5, max_iter=200,min_samples_leaf=10),
                ]

# The dataset cell pickles the GBC feature frames and deletes them from the
# namespace, so the feature count is read back from disk here (the original
# referenced the deleted `gbc_train_datasets` and failed on Run All).
# The last column of each frame is 'match', hence the -1.
with open(f'{outputs_path}/intermediateOutputs/datasets/gbc_train.pkl', 'rb') as handle:
    n_features = pickle.load(handle)[0].shape[1]-1

n_metrics = len(comparison_metrics)

# positions within comparison_metrics
mult_positions = [3,4,9]        # dot_product, fidelity, bhattacharya_1
ent_positions = [0,11]          # entropy, cross_ent
dif_positions = [1,2,5,7,10]    # manhattan, lorentzian, matusita, laplacian, squared_chord

indices = dict()
indices['all-sims'] = list(range(n_features))
indices['all-mults'] = list()
indices['all-ents'] = list()
indices['all-difs'] = list()

# features are laid out in blocks of n_metrics columns, one block per setting
for i in range(int(n_features/n_metrics)):

    offset = i*n_metrics
    setting_mults = [offset+p for p in mult_positions]
    setting_ents = [offset+p for p in ent_positions]
    setting_difs = [offset+p for p in dif_positions]

    indices[f'setting-{i}-all'] = list(range(offset, offset+n_metrics))
    indices[f'mults-{i}'] = setting_mults
    indices[f'ents-{i}'] = setting_ents
    indices[f'difs-{i}'] = setting_difs

    # `+` builds new lists, so the per-setting lists are never aliased
    indices['all-mults'] = indices['all-mults'] + setting_mults
    indices['all-ents'] = indices['all-ents'] + setting_ents
    indices['all-difs'] = indices['all-difs'] + setting_difs


print(f' total number of models for each: {len(models) * len(indices)}')

Train Models and Collect Train Error

In [None]:
def _load_gbc_datasets(split_name):
    """Load the pickled list of GBC feature frames for 'train', 'val' or 'test'.

    The original called ``pickle.load(gbc_test_datasets, handle)``: the file
    object was passed in the wrong position and the first argument named a
    variable that had already been deleted.
    """
    with open(f'{outputs_path}/intermediateOutputs/datasets/gbc_{split_name}.pkl', 'rb') as handle:
        return pickle.load(handle)


def _positive_scores(model, features):
    """Predicted probability of the class labelled 1 for every row of ``features``."""
    pos_ind = np.where(model.classes_==1)[0][0]
    return model.predict_proba(features)[:,pos_ind]


def _score_split(split_name):
    """AUCs of every trained model on one held-out split.

    Returns a dict with the keys 'same' and 'dif', each a list ordered like
    ``model_aucs['name']``. Frame 0 is scored with the 'same_*' models and
    frame 1 with the 'dif_*' models.
    """
    datasets = _load_gbc_datasets(split_name)
    scores = {'same': list(), 'dif': list()}

    for name in model_aucs['name'].tolist():

        # names are '<subset>_<model position>'; strip the trailing position
        subset_name = name.rsplit('_', 1)[0]

        for ce_prefix, frame in zip(('same', 'dif'), datasets):
            sub = frame.iloc[:,indices[subset_name]]
            model = trained_models[f'{ce_prefix}_{name}']
            scores[ce_prefix].append(auc(frame['match'], _positive_scores(model, sub)))

    return scores


model_names = list()
train_aucs = {'same': list(), 'dif': list()}
trained_models = dict()

gbc_train_datasets = _load_gbc_datasets('train')

for key, value in indices.items():

    for ce_prefix, train_frame in zip(('same', 'dif'), gbc_train_datasets):

        sub = train_frame.iloc[:,value]

        # fresh, unfitted copies for every (subset, dataset) combination
        for i, model in enumerate(copy.deepcopy(models)):

            model.fit(sub,train_frame['match'])
            train_aucs[ce_prefix].append(auc(train_frame['match'], _positive_scores(model, sub)))
            trained_models[f'{ce_prefix}_{key}_{i}'] = model

    model_names.extend(f'{key}_{i}' for i in range(len(models)))

# dict constructor keeps the AUC columns numeric (the transposed list-of-lists
# used before produced object columns)
model_aucs = pd.DataFrame({
    'name': model_names,
    'same_train': train_aucs['same'],
    'dif_train': train_aucs['dif'],
})

del(gbc_train_datasets)

val_scores = _score_split('val')
test_scores = _score_split('test')

model_aucs['same_val'] = val_scores['same']
model_aucs['dif_val'] = val_scores['dif']
model_aucs['same_test'] = test_scores['same']
model_aucs['dif_test'] = test_scores['dif']

model_aucs

Define Distance Functions by Features

In [None]:
# Each entry maps a name to (list of parameter names, second tuple element).
# The second element is None throughout and is passed to testUtils.dict_combine.
flats = {
          'fdif_quadk':(['a','b','c','d','e'],None),
          'fadd_quadk':(['f','g','h','i','j'],None),
          'fmult_quadk':(['k','l','m','n','o'],None),
}

exts = {'edif_add':(['b','g','p','q','r','s','t','u','v','w'],None),
        'edif_mult':(['b','l','x','y','z','a_','b_','c_','d_','e_'],None),
        'emult_add':(['l','g','f_','g_','h_','i_','j_','k_','l_','m_'],None),
}


def _pairwise_params(groups):
    """Combine every ordered pair of entries in ``groups`` (self-pairs included).

    The combined entry is keyed '<key>_<key_>' and holds the sorted union of
    the two parameter-name lists plus ``testUtils.dict_combine`` of the two
    second elements.

    NOTE(review): (a, b) and (b, a) produce the same parameter set under two
    names; the original declared an unused `seen` set, which suggests
    de-duplication was planned. The behaviour is kept as it was.
    """
    combined = dict()
    for key, (names, extra) in groups.items():
        for key_, (names_, extra_) in groups.items():
            combined[f'{key}_{key_}'] = (sorted(set(names+names_)), testUtils.dict_combine(extra, extra_))
    return combined


params = _pairwise_params(flats)
params_ = _pairwise_params(exts)
params.update(params_)

params['all_flat']= (['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'],None)
params['all_ext'] = (['b','l','g','p','q','r','s','t','u','v','w','x','y','z','a_','b_','c_','d_','e_','f_','g_','h_','i_','j_','k_','l_','m_'],None)

norm_names = ['n_','o_','p_','q_','r_','s_']

# Add a '<key>_normed' variant of every set defined so far.
# The original wrote these variants into `params_` after it had already been
# merged into `params`, so they were silently dropped. Iterating over a
# snapshot allows inserting into `params` during the loop.
for key, (names, extra) in list(params.items()):
    params[f'{key}_normed']=(sorted(set(names+norm_names)),testUtils.dict_combine(extra,None))

params['norm_only']=(list(norm_names),None)

Create all Func Obs

In [None]:
reload(func_ob)
reload(TunaSims)
reload(testUtils)


# --- loss and regularisation helpers ---------------------------------------
def squared_loss(x):
    """Squared distance of x from 1."""
    return (1-x)**2


def lin_loss(x):
    """Absolute distance of x from 1."""
    return abs(1-x)


def l1_reg(l, x):
    """l times the sum of absolute values of x."""
    return l*np.sum(np.abs(x))


def l2_reg(l, x):
    """l times the Euclidean norm of x."""
    return l*np.sqrt(np.sum(x**2))


def no_reg(x):
    """Regulariser that always contributes 0."""
    return 0


# --- search grid ------------------------------------------------------------
reg_funcs = [partial(l2_reg,.1),no_reg]
reg_names = ['l2_0.1','none']
losses = [squared_loss]
loss_names = ['squared']
momentums = ['none','simple','jonie']
mom_weights = [[0.2,0.8]]
lambdas = [0.01]
max_iters = [1e4]

# the same specification is used for the same-CE and the different-CE family
stoch_spec = dict(
    reg_funcs=reg_funcs,
    reg_names=reg_names,
    losses=losses,
    loss_names=loss_names,
    momentums=momentums,
    params=params,
    mom_weights=mom_weights,
    lambdas=lambdas,
    max_iters=max_iters,
    func=TunaSims.tuna_combo_distance,
)

funcs_same = testUtils.create_all_funcs_stoch(**stoch_spec)
funcs_dif = testUtils.create_all_funcs_stoch(**stoch_spec)

# only the first two specifications of each family are carried forward
all_funcs = [funcs_same[:2], funcs_dif[:2]]

print(f'number of specifications: {len(funcs_same)}')

In [None]:
# Fit every function object on every (dataset, transformation setting) pair.
# trained_dict maps '<dataset name>_<setting index>' to the list of fitted objects.
trained_dict = dict()

for ds_idx in range(len(train_datasets)):

    # shuffle the rows; the shuffled frame replaces the list entry, as before
    shuffled = train_datasets[ds_idx].sample(frac=1)
    train_datasets[ds_idx] = shuffled

    for j in range(int(shuffled.shape[1]/2)):

        # columns 2j and 2j+1 hold the query/target pair for setting j;
        # copy so the label column is not written into a slice
        sub = shuffled.iloc[:,2*j:2*(j+1)].copy()
        sub.columns=['query','target']
        sub['match'] = shuffled['match']

        trained=list()
        for i, template in enumerate(all_funcs[ds_idx]):

            # Fit a private copy. The original refit the very same objects for
            # every setting j, so all entries of trained_dict for one dataset
            # pointed at objects holding only the last fit.
            fitted = copy.deepcopy(template)
            fitted.fit(sub)
            trained.append(fitted)

            # was `i+1%10==0`, which parses as `i+(1%10)==0` and never fires
            if (i+1)%10==0:
                print(f'trained {i+1} functions on {dataset_names[ds_idx]}_{j}')

        trained_dict[f'{dataset_names[ds_idx]}_{j}'] = trained

Get AUCs

In [None]:
test_datasets = [sub_test_same_ce, sub_test_dif_ce]
#test_datasets = [pd.concat((sub_train_same_ce, sub_train_dif_ce)).sample(frac=1)]
reload(func_ob)
reload(TunaSims)
reload(testUtils)

# Score the fitted function objects on the test frames.
# Loop bounds now come from test_datasets, which is what is indexed below;
# the original took them from train_datasets.
# NOTE: `models` and `sub` keep their original names because the inspection
# cells further down read them after this loop.
result_parts = list()
for ds_idx in range(len(test_datasets)):
    for j in range(int(test_datasets[ds_idx].shape[1]/2)):

        #grab trained models for this portion of dataframe
        models = trained_dict[f'{dataset_names[ds_idx]}_{j}']

        # copy so the label column is not written into a slice of the test frame
        sub = test_datasets[ds_idx].iloc[:,2*j:2*(j+1)].copy()
        sub.columns=['query','target']
        sub['match'] = test_datasets[ds_idx]['match'].tolist()

        small = testUtils.trained_res_to_df(models,sub)
        small.insert(1,'settings', f'{dataset_names[ds_idx]}_{j}')
        result_parts.append(small)

# concatenate once; stays None when nothing was scored, as before
trained_res = pd.concat(result_parts) if result_parts else None



In [None]:
# Position of the first row of `sub` whose 'match' value is not False.
# The original while-loop ran past the end of the frame (IndexError) when no
# such row existed; in that case len(sub) is reported now.
match_positions = np.flatnonzero(sub['match'].ne(False).to_numpy())
i = int(match_positions[0]) if match_positions.size else len(sub)
print(i)

In [None]:
# Sanity check: does the first trained function return exactly the same value
# for row 50 and row 500 of `sub`?
models[0].trained_func()(sub.iloc[50]['query'],sub.iloc[50]['target'])==models[0].trained_func()(sub.iloc[500]['query'],sub.iloc[500]['target'])

In [None]:
# Value of the first trained function for the query/target pair in row 50 of `sub`.
a = models[0].trained_func()(sub.iloc[50]['query'],sub.iloc[50]['target'])

def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), evaluated with NumPy's exp."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# Ratio of the raw value `a` to its sigmoid.
a/sigmoid(a)

In [None]:
# Show the `momentum_type` attribute of the first entry in `models`.
models[0].momentum_type

In [None]:
# Show the `trained_vals` attribute of the first entry in `models`.
models[0].trained_vals

In [None]:
# Display the results table accumulated in the "Get AUCs" cell.
trained_res


Conclusions: 

add offsets for terms

The number of parameters does not appear to change training time much

consider replacing knockouts with sigmoids

consider tuning final sigmoid

Should features like length and entropy be included in the similarity itself, or be used outside it as extra features in the learned model? Both? Neither?


Other Ideas:

Accuracy (In order of increasing difficulty):

-Incorporate as feature how many possible chem structures (can also restrict to NPS) exist within a certain precursor distance. (violating golden rules or not)

-include original NIST version or theoretical res as feature

-Weight different ranges of spec differently for matches (more diversity/greater accuracy)

-smush together top n results over different inchicores and come up with combined model predicting over individual inchicores

-diagnostic ion/loss classing as a feature...do they match

-kernelized smooth match

-3d struct guesses...do they match (cores, but can generalize to 3d)

Speed(In order of increasing difficulty):

-combine sim metrics and expand(apply func to df)

-exclude matches based on non-similarity features to cut down on needed comparisons

-ion tables to upper bound similarity

-only use one peak consolidation and matching protocol...then only do reweight transformations on already matched peaks for spec and sim features

-can missing peaks in lower energy be explained by frags and losses from higher energy? incorporate into model

Order to proceed:

-recreate databases with coll energy included (standardized format across DBs)

-what proportion of matches are the same coll energy?

-quantify variability in peak appearance vs peak intensity across collision energies
    -does this relate in a predictable way to fragment mass

-test sim metrics for same coll energy vs not same coll energy (is the same inductive bias useful)

-Show that regular funcs are in the space of combo distance

-test combining individual metrics that use different components of the 2 vectors (add, mult, dif)

-range over individual metrics in combined score in attempt to explain why combining them is successful

-train combo metrics with flattened components and individual (should these sims be broken out?)
    -should we do this for same coll energy vs dif energies

-are different combo metrics put into larger model more successful than the combined individual metrics

-can tunasims be fit with nonlinearities between the components (flattened or not?)