In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from importlib import reload
import os
from collections import Counter
from sklearn.metrics import roc_auc_score as auc
import copy
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
import pickle
import warnings
import math
warnings.filterwarnings("ignore")

import TunaSims
import func_ob
import tools
import datasetBuilder
import testUtils
import spectral_similarity
import itertools
import reweightFuncs

In [None]:
# Pickled spectral-library tables all live in one directory; build each path from it.
db_csv_dir = '/Users/jonahpoczobutt/projects/raw_data/db_csvs'

nist14 = f'{db_csv_dir}/nist14_highres.pkl'
nist20_prot = f'{db_csv_dir}/nist20_prot_fiehn_.pkl'
nist20 = f'{db_csv_dir}/nist20.pkl'
nist23_prot = f'{db_csv_dir}/nist23_prot_deprot_only.pkl'
nist23 = f'{db_csv_dir}/nist23_full.pkl'
gnps = f'{db_csv_dir}/gnps_highres.pkl'
mona = f'{db_csv_dir}/mona_highres.pkl'
metlin = f'{db_csv_dir}/metlin.pkl'


Discussion Points

Should we look at inchiKey for the match rather than inchiCore?

does spectral entropy relate to precursor m/z - yes

does spectral entropy relate to CE - yes

does peak intensity relate to m/z - meh

does peak intensity relate to CE

What factors should play into reweighting

    -for quality measure: precursor mz
    -for reducing corr: fragment mz

Does using less-correlated similarity measures produce better results?

Can we obtain lower correlation with the same cleaning procedure, in order to be memory efficient (i.e., through the choice of similarity measures)?

Create all Necessary Directories

In [None]:
# Root folder that every intermediate and final artefact of this notebook is written under.
outputs_path='/Users/jonahpoczobutt/projects/TunaRes/test'

# Library searched from (query) and library searched against (target).
self_search=True
query = nist20
target = nist20

# NOTE(review): self_search is already True above, so this check can never turn it off;
# if it is meant to be derived, `self_search = (query == target)` is probably intended.
if query == target:
    self_search = True

fullRun=True
if fullRun:
    # makedirs(exist_ok=True) keeps the cell re-runnable: os.mkdir raised
    # FileExistsError as soon as the output tree already existed.
    output_subdirs = [
        'intermediateOutputs/splitMatches/inchisBySet',
        'intermediateOutputs/splitMatches/train',
        'intermediateOutputs/splitMatches/val',
        'intermediateOutputs/splitMatches/test',
        'intermediateOutputs/splitMatches/port',
        'intermediateOutputs/datasets/train',
        'intermediateOutputs/datasets/val',
        'intermediateOutputs/datasets/test',
        # the index/correlation pickles are written here further down the notebook
        'intermediateOutputs/gbc_res',
        'intermediateOutputs/train_to_func',
        'intermediateOutputs/train_to_error',
        'gbc_res',
    ]
    for subdir in output_subdirs:
        os.makedirs(f'{outputs_path}/{subdir}', exist_ok=True)
    

Split Queries into Train, Val, Test by Core or Key

In [None]:
fullRun=True
match_category = 'inchi_base'
if fullRun:

    # TODO: replace with a function that reads in all the databases
    query_ = pd.read_pickle(query)

    # jonah edit: only the first 200 rows are kept
    query_ = query_[:200]
    all_bases = list(set(query_[match_category]))

    # queryID is the row position for a self search, otherwise a "_" placeholder
    if self_search:
        query_ids = list(range(len(query_)))
    else:
        query_ids = ["_"] * len(query_)
    query_.insert(0, 'queryID', query_ids)

    # shuffles the list in place (unseeded, so the split differs between runs)
    np.random.shuffle(all_bases)

    # 50% / 20% / 30% partition of the distinct match_category values
    first_cut = int(len(all_bases)*0.5)
    second_cut = int(len(all_bases)*0.7)
    first_bases = all_bases[:first_cut]
    second_bases = all_bases[first_cut:second_cut]
    third_bases = all_bases[second_cut:]

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches/inchisBySet'

    # every query row goes to the split that owns its match_category value
    first_query_ = query_[np.isin(query_[match_category],first_bases)].reset_index()
    first_query_.to_pickle(f'{split_dir}/first_query.pkl')

    second_query_ = query_[np.isin(query_[match_category],second_bases)].reset_index()
    second_query_.to_pickle(f'{split_dir}/second_query.pkl')

    third_query_ = query_[np.isin(query_[match_category],third_bases)].reset_index()
    third_query_.to_pickle(f'{split_dir}/third_query.pkl')

    # keep a record of which values landed in which split
    np.save(f'{split_dir}/first_bases.npy',first_bases)
    np.save(f'{split_dir}/second_bases.npy',second_bases)
    np.save(f'{split_dir}/third_bases.npy',third_bases)

    del first_bases, second_bases, third_bases, all_bases


Set Parameters Here!!

In [None]:
# ---- match / dataset construction ----
chunk_size = 1e6
adduct_match = False
strong_self_separation = False

# number of chunks to be combined for calculating correlations and collecting testable indices
num_chunks=1

# column used as the classification label; should be either inchicore or inchi
label_field = 'InchiCoreMatch'

# similarity measures computed for every cleaning setting (order matters: later cells
# address these by position)
comparison_metrics = [
    'entropy',
    'manhattan',
    'lorentzian',
    'dot_product',
    'fidelity',
    'matusita',
    'chi2',
    'laplacian',
    'laplacian_unnorm',
    'sigmoid',
    'sigmoid_unnorm',
    'harmonic_mean',
    'bhattacharya_1',
    'squared_chord',
    'cross_ent',
]
sim_methods=comparison_metrics

ppm_windows = [3]

# ---- spectrum cleaning grid: each *_names list labels the matching entries ----
# NOTE(review): the second entry is labelled '5%' but uses perc_thresh = 0.5 - confirm the units.
noise_threshes=[
    partial(reweightFuncs.noise_clip, perc_thresh = 0.0),
    partial(reweightFuncs.noise_clip, perc_thresh = 0.5),
    partial(reweightFuncs.noise_clip, fixed_thresh = 10),
]
noise_names = ['None','5%','10']

centroid_tolerance_vals = [0.05,0.0]
centroid_tolerance_types=['da','da']

reweight_methods = [
    partial(reweightFuncs.logent,intercept = 0.25),
    reweightFuncs.weight_intensity_by_entropy,
    partial(reweightFuncs.fixed_power,power=1),
]
reweight_names = ['logent','fiehn','1']

prec_removes=[lambda x: x-1.6, lambda x: None]
prec_remove_names = ['fiehn', 'none']

# ---- dataset size caps ----
train_size=3e6
# NOTE(review): test_size is assigned twice, so only 2e6 survives; the first value may have
# been meant for a separate validation size - confirm.
test_size=1e6
test_size=2e6
    

Create all Train Data

In [None]:
# Per-window output folders for every split. makedirs(exist_ok=True) replaces a bare
# try/except around six os.mkdir calls, which silently skipped all remaining folders
# as soon as one of them already existed.
for i in ppm_windows:
    for split_name in ('train', 'val', 'test'):
        os.makedirs(f'{outputs_path}/intermediateOutputs/splitMatches/{split_name}/{i}_ppm', exist_ok=True)
        os.makedirs(f'{outputs_path}/intermediateOutputs/datasets/{split_name}/{i}_ppm', exist_ok=True)

#read in first bases and shuffle order
query_ = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/inchisBySet/first_query.pkl')
query_ = query_.sample(frac=1)

# queryID on the target side: row position for a self search, "*" placeholder otherwise
target_=pd.read_pickle(target)
if self_search:
    target_.insert(0,'queryID', [i for i in range(len(target_))])
else:
    target_.insert(0,'queryID', ["*" for i in range(len(target_))])

# Writes the match chunks and the model-data chunks for the training split.
datasetBuilder.create_matches_and_model_data(query_,
                              target_,
                            matchesOutputPath = f'{outputs_path}/intermediateOutputs/splitMatches/train',
                            modelDataOutputPath = f'{outputs_path}/intermediateOutputs/datasets/train',
                            chunk_size = chunk_size,
                            max_size = train_size,
                            ppm_windows = ppm_windows,
                            noise_threshes = noise_threshes,
                            noise_names = noise_names,
                            centroid_tolerance_vals = centroid_tolerance_vals,
                            centroid_tolerance_types = centroid_tolerance_types,
                            reweight_methods = reweight_methods,
                            reweight_names = reweight_names,
                            sim_methods = comparison_metrics,
                            prec_removes = prec_removes,
                            prec_remove_names = prec_remove_names
                            )

# free the large frames before the next cell loads its own copies
del(query_)
del(target_)

In [None]:
from sklearn.metrics.pairwise import pairwise_kernels as pk

# (1 - sigmoid kernel) for a pair of swapped [.25, .75] vectors, relative to the same
# quantity for a pair of swapped [0, 1] vectors.
partial_overlap_dist = 1 - pk([[.25,.75]], [[.75,.25]], metric="sigmoid")[0][0]
no_overlap_dist = 1 - pk([[0,1]], [[1,0]], metric="sigmoid")[0][0]
partial_overlap_dist/no_overlap_dist

In [None]:
# Peek at the first train match chunk; the path is derived from outputs_path / ppm_windows
# rather than repeating the absolute path, so it follows the configuration cell.
pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/train/{ppm_windows[0]}_ppm/chunk_1.pkl')

Create all Val Data

In [None]:
# Validation split: load the second set of queries and shuffle the row order.
query_ = pd.read_pickle(
    f'{outputs_path}/intermediateOutputs/splitMatches/inchisBySet/second_query.pkl'
).sample(frac=1)

# queryID on the target side: row position for a self search, "*" placeholder otherwise
target_ = pd.read_pickle(target)
target_.insert(0, 'queryID', list(range(len(target_))) if self_search else ["*"] * len(target_))

# NOTE(review): max_size is train_size here as well - confirm that is intended for validation.
val_build_options = dict(
    matchesOutputPath = f'{outputs_path}/intermediateOutputs/splitMatches/val',
    modelDataOutputPath = f'{outputs_path}/intermediateOutputs/datasets/val',
    chunk_size = chunk_size,
    max_size = train_size,
    ppm_windows = ppm_windows,
    noise_threshes = noise_threshes,
    noise_names = noise_names,
    centroid_tolerance_vals = centroid_tolerance_vals,
    centroid_tolerance_types = centroid_tolerance_types,
    reweight_methods = reweight_methods,
    reweight_names = reweight_names,
    sim_methods = comparison_metrics,
    prec_removes = prec_removes,
    prec_remove_names = prec_remove_names,
)
datasetBuilder.create_matches_and_model_data(query_, target_, **val_build_options)

del query_, target_


Create Test Data

In [None]:
# Test split: load the third set of queries and shuffle the row order.
query_ = pd.read_pickle(
    f'{outputs_path}/intermediateOutputs/splitMatches/inchisBySet/third_query.pkl'
).sample(frac=1)

# queryID on the target side: row position for a self search, "*" placeholder otherwise
target_ = pd.read_pickle(target)
target_.insert(0, 'queryID', list(range(len(target_))) if self_search else ["*"] * len(target_))

# NOTE(review): max_size is train_size although a test_size parameter is defined in the
# parameters cell - confirm which one is intended.
test_build_options = dict(
    matchesOutputPath = f'{outputs_path}/intermediateOutputs/splitMatches/test',
    modelDataOutputPath = f'{outputs_path}/intermediateOutputs/datasets/test',
    chunk_size = chunk_size,
    max_size = train_size,
    ppm_windows = ppm_windows,
    noise_threshes = noise_threshes,
    noise_names = noise_names,
    centroid_tolerance_vals = centroid_tolerance_vals,
    centroid_tolerance_types = centroid_tolerance_types,
    reweight_methods = reweight_methods,
    reweight_names = reweight_names,
    sim_methods = comparison_metrics,
    prec_removes = prec_removes,
    prec_remove_names = prec_remove_names,
)
datasetBuilder.create_matches_and_model_data(query_, target_, **test_build_options)

del query_, target_

Create indices to pull for interesting metric combos, instantiate GBC models

In [None]:
# Positions of the columns to keep from every dataset chunk: every cleaning/similarity
# option switched on, the six spectrum-feature slots switched off.
sim_indices = datasetBuilder.generate_keep_indices(noise_threshes=[True] * len(noise_threshes),
                                                centroid_tolerance_vals = [True] * len(centroid_tolerance_vals),
                                                reweight_methods = [True] * len(reweight_methods),
                                                sim_methods = [True] * len(sim_methods),
                                                prec_removes = [True] * len(prec_removes),
                                                spec_features=[False] * 6)

n_metrics = len(comparison_metrics)

# every unordered pair of metric positions, stored as [larger, smaller]
unique_pairs = [[j, k] for j in range(n_metrics) for k in range(j)]

# every unordered triplet exactly once (prepending j to each pair not containing j
# produced each triplet three times, in different orders)
unique_triplets = [list(combo) for combo in itertools.combinations(range(n_metrics), 3)]

# Hard-coded positions inside comparison_metrics for the hand-picked groups.
# NOTE(review): these positions look like they were chosen for a shorter metric list -
# confirm they still point at the intended metrics.
mult_positions = np.array([3,4,9,11])
dif_positions = np.array([1,2,5,7,10])
ent_positions = np.array([0])


def _add_corr_groups(indices, corrs, data, size, label, num_condition, num_control, offset=0):
    """Store the low-correlation and random-control column groups of one size.

    Calls testUtils.get_least_corr_and_control on ``data`` and files its two results
    under 'low-corr-{size}-{label}_{n}' and 'rand-{size}-{label}_{n}'. Element [0] of
    each result is stored in ``indices`` and element [1] in ``corrs``.

    ``offset`` is added to the column positions only, so positions computed on a column
    slice refer to the full frame. The correlation values are stored unshifted.
    """
    low, rand = testUtils.get_least_corr_and_control(data,
                                                     size,
                                                     num_condition = num_condition,
                                                     num_control = num_control)
    for prefix, (positions, correlations), count in (('low-corr', low, num_condition),
                                                     ('rand', rand, num_control)):
        for n in range(count):
            key = f'{prefix}-{size}-{label}_{n}'
            indices[key] = positions[n] + offset if offset else positions[n]
            corrs[key] = correlations[n]


for window in ppm_windows:

    # read up to num_chunks train chunks, stopping at the first one that does not exist
    chunks=list()
    for j in range(num_chunks):
        chunk_path = f'{outputs_path}/intermediateOutputs/datasets/train/{window}_ppm/chunk_{j+1}.pkl'
        if not os.path.exists(chunk_path):
            break
        chunks.append(pd.read_pickle(chunk_path).iloc[:,sim_indices])

    train = pd.concat(chunks)
    del(chunks)

    # classifier configurations that are later fitted on every index set
    models = [
            hgbc(),
            hgbc(learning_rate=0.5),
            hgbc(max_iter=200),
            hgbc(learning_rate=0.01,min_samples_leaf=10),
            hgbc(max_iter=200,min_samples_leaf=10),
            hgbc(learning_rate=0.5, max_iter=200,min_samples_leaf=10),
            ]

    indices = dict()
    corrs = dict()
    # the last column of train is excluded from every "all" index set
    indices['all-sims'] = list(range(train.shape[1]-1))
    indices['all-mults'] = list()
    indices['all-ents'] = list()
    indices['all-difs'] = list()

    # groups drawn from all columns at once
    num_condition = 10
    num_control = 20
    all_features = train.iloc[:,:-1]
    for size in (3, 5, 10, 15, 20):
        _add_corr_groups(indices, corrs, all_features, size, 'all', num_condition, num_control)
        print(f'generated {size}')

    # groups drawn within each block of n_metrics consecutive columns (one block per setting)
    num_condition = 5
    num_control = 5
    for setting in range(int((train.shape[1]-1)/n_metrics)):

        offset = setting*n_metrics
        setting_features = train.iloc[:,offset:offset+n_metrics]
        for size in (3, 5):
            _add_corr_groups(indices, corrs, setting_features, size, setting,
                             num_condition, num_control, offset=offset)

        indices[f'all-setting-{setting}'] = list(np.arange(n_metrics)+offset)
        indices[f'mults-{setting}'] = list(mult_positions+offset)
        indices[f'difs-{setting}'] = list(dif_positions+offset)

        indices['all-mults'] = indices['all-mults'] + list(mult_positions+offset)
        indices['all-ents'] = indices['all-ents'] + list(ent_positions+offset)
        indices['all-difs'] = indices['all-difs'] + list(dif_positions+offset)

        for n, pair in enumerate(unique_pairs):
            indices[f'pair_{setting}_{n}'] = np.array(pair)+offset

        for n, triplet in enumerate(unique_triplets):
            indices[f'triplet_{setting}_{n}'] = np.array(triplet)+offset

    print('finished creating indices')

    #now populate correlation dictionary with anything we don't already have:
    #the mean pairwise correlation over the index set
    corr_values = train.corr().to_numpy()
    for key, value in indices.items():

        if key not in corrs:

            n_pairs = math.comb(len(value),2)
            corr = 0
            for a in value:
                for b in value:

                    if a>b:
                        corr += corr_values[a,b]/n_pairs

            corrs[key] = corr

    # the directory-setup cell only creates {outputs_path}/gbc_res, so make sure this one exists
    gbc_dir = f'{outputs_path}/intermediateOutputs/gbc_res'
    os.makedirs(gbc_dir, exist_ok=True)

    with open(f'{gbc_dir}/custom_indices_{window}_ppm.pkl', 'wb') as handle:

        pickle.dump(indices,handle)

    with open(f'{gbc_dir}/mean_correlations_{window}_ppm.pkl', 'wb') as handle:

        pickle.dump(corrs,handle)

    print(f' total number of models: {len(models) * len(indices)}')
    # indices is deliberately kept: the model-training cell below passes it on
    del(corrs)

breakpoint

Train all models, collecting the input AUCs, their correlations, and their combined performance

Train Models and Collect Train Error

In [None]:
def train_and_name_models(train, models, indices):
    """Fit a fresh copy of every model on every column subset.

    Parameters
    ----------
    train : DataFrame holding the feature columns plus a 'match' label column.
    models : list of unfitted estimators exposing ``fit(X, y)``; they are deep-copied
        for each subset, so the objects passed in are never fitted.
    indices : dict mapping a subset name to the positional column indices to train on.

    Returns
    -------
    dict mapping '{subset name}_{position of the model in models}' to the fitted estimator.
    """

    # must be a dict: results are stored and later looked up by name
    trained_models = dict()
    for key, value in indices.items():

        sub = train.iloc[:,value]
        models_ = copy.deepcopy(models)

        for i, model in enumerate(models_):

            model.fit(sub,train['match'])
            trained_models[f'{key}_{i}'] = model

    return trained_models

def evaluate_models_by_subset(models, indices, eval_data):
    """Compute the ROC AUC of every trained model on ``eval_data``.

    Parameters
    ----------
    models : dict mapping '{subset name}_{model number}' to a fitted classifier that
        exposes ``classes_`` and ``predict_proba``.
    indices : dict mapping a subset name to the positional column indices the
        corresponding models were trained on.
    eval_data : DataFrame with the same column layout as the training frame, including
        the 'match' label column.

    Returns
    -------
    list of AUC values, ordered by sorted model name.

    Raises
    ------
    IndexError if a model was fitted without the label value 1 among its classes.
    """

    model_aucs = list()
    for name in sorted(models):

        # Only the trailing '_{model number}' is stripped; subset names may themselves
        # contain underscores, so split('_')[0] would cut them short.
        subset_name = name.rsplit('_', 1)[0]

        sub = eval_data.iloc[:,indices[subset_name]]
        model = models[name]
        # column of predict_proba that holds the probability of class 1
        pos_ind = np.where(model.classes_==1)[0][0]
        # labels come from the frame being evaluated, not from a notebook global
        model_aucs.append(auc(eval_data['match'],model.predict_proba(sub)[:,pos_ind]))

    return model_aucs

In [None]:
def _load_split(split, window):
    """Load up to num_chunks dataset chunks of one split for one ppm window.

    Chunks are read in order and reading stops at the first chunk file that does not
    exist. The label column is read from each full chunk before it is reduced to the
    sim_indices columns, then attached to the combined frame as 'match'.
    """
    chunks = list()
    labels = list()
    for j in range(num_chunks):
        chunk_path = f'{outputs_path}/intermediateOutputs/datasets/{split}/{window}_ppm/chunk_{j+1}.pkl'
        if not os.path.exists(chunk_path):
            break
        chunk = pd.read_pickle(chunk_path)
        labels.extend(chunk[label_field].tolist())
        chunks.append(chunk.iloc[:,sim_indices])

    data = pd.concat(chunks)
    data['match'] = labels
    return data


for window in ppm_windows:

    # Reload the index sets written by the index-building cell, so this cell does not
    # depend on the in-memory dictionary still being around.
    with open(f'{outputs_path}/intermediateOutputs/gbc_res/custom_indices_{window}_ppm.pkl', 'rb') as handle:
        indices = pickle.load(handle)

    train = _load_split('train', window)
    trained_models = train_and_name_models(train, models, indices)
    names = sorted(list(trained_models.keys()))
    train_aucs = evaluate_models_by_subset(trained_models, indices, train)
    del(train)

    val = _load_split('val', window)
    val_aucs = evaluate_models_by_subset(trained_models, indices, val)
    del(val)

    test = _load_split('test', window)
    test_aucs = evaluate_models_by_subset(trained_models, indices, test)
    del(test)

    # one row per model; the AUC lists share the sorted-name order of `names`
    model_aucs = pd.DataFrame({'name': names,
                               'train': train_aucs,
                               'val': val_aucs,
                               'test': test_aucs})

    model_aucs.to_csv(f'{outputs_path}/gbc_res/model_aucs_{window}_ppm.csv')
    del(model_aucs)

Train functions to reproduce the original metrics, then evaluate how far off we are on test data with the original normalization

In [None]:
def _default_inits():
    """Build the starting value for every fit parameter.

    Parameter names are 'a'..'z' followed by 'a_'..'z_', in that order. Each starts
    at 1 unless it is listed below as starting at -1 or 0.
    """
    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    names = letters + [letter + '_' for letter in letters]

    starts_negative = {'d', 'i', 'n', 't', 'v',
                       'b_', 'd_', 'j_', 'l_', 'q_', 's_', 'w_', 'y_'}
    starts_zero = {'h', 'r'}

    values = {}
    for name in names:
        if name in starts_negative:
            values[name] = -1
        elif name in starts_zero:
            values[name] = 0
        else:
            values[name] = 1
    return values


inits = _default_inits()


def _build_fit_funcs():
    """Assemble, per '{metric}-{variant}' name, the parameter names to fit.

    Every value is a (parameter-name list, None) tuple. The lists are built from the
    shared runs of names below; each entry gets its own list object.
    """
    a_to_e = ['a', 'b', 'c', 'd', 'e']
    f_to_j = ['f', 'g', 'h', 'i', 'j']
    k_to_o = ['k', 'l', 'm', 'n', 'o']
    n_to_s_tail = ['n_', 'o_', 'p_', 'q_', 'r_', 's_']
    t_to_y_tail = ['t_', 'u_', 'v_', 'w_', 'x_', 'y_']
    bl_x_to_e = ['b', 'l', 'x', 'y', 'z', 'a_', 'b_', 'c_', 'd_', 'e_']

    def a_to_e_family(metric):
        # lorentzian, squared_chord and bhattacharya_1 share the same three variants
        return {
            f'{metric}-1': ['a', 'b'],
            f'{metric}-2': list(a_to_e),
            f'{metric}-3': a_to_e + f_to_j + ['k'] + n_to_s_tail,
        }

    param_names = {
        'entropy-1': ['f', 'g', 'i', 'n_', 'p_'],
        'entropy-2': f_to_j + n_to_s_tail,
        'entropy-3': f_to_j + n_to_s_tail + bl_x_to_e,
    }
    param_names.update(a_to_e_family('lorentzian'))
    param_names.update({
        'dot_product-1': ['k', 'l', 't_', 'u_'],
        'dot_product-2': k_to_o + t_to_y_tail,
        'dot_product-3': k_to_o + t_to_y_tail + ['k'] + n_to_s_tail,
        'harmonic_mean-1': ['x', 'y', 'z', 'a_', 'b_', 'c_'],
        'harmonic_mean-2': bl_x_to_e + ['b', 'l'],
        'harmonic_mean-3': bl_x_to_e + ['b', 'l'] + n_to_s_tail,
        'fidelity-1': ['k', 'l', 'm'],
        'fidelity-2': list(k_to_o),
        'fidelity-3': k_to_o + n_to_s_tail,
    })
    param_names.update(a_to_e_family('squared_chord'))
    param_names.update(a_to_e_family('bhattacharya_1'))

    return {name: (params, None) for name, params in param_names.items()}


fit_funcs = _build_fit_funcs()

# pick up any edits made to the local modules since they were imported
reload(testUtils)
reload(func_ob)
reload(TunaSims)


def _load_pickle_file(path):
    """Unpickle and return the object stored at ``path``.

    Only meant for files this project wrote itself: pickle.load can execute arbitrary
    code, so never point this at an untrusted file.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# ---- optimisation grid (independent of the ppm window, so defined once) ----
squared_loss = lambda x: (x)**2
lin_loss = lambda x: np.abs(x)
l1_reg = lambda l,x: l*np.sum(np.abs(x))
l2_reg = lambda l,x: l*np.sqrt(np.sum(x**2))
no_reg = lambda x: 0

reg_funcs = [no_reg,partial(l2_reg,0.01),partial(l1_reg,0.01)]
reg_names = ['none','l2_0.01','l1_0.01']
losses = [squared_loss]
loss_names = ['squared']
momentums = ['none','simple']
mom_weights = [[0.2,0.8]]
lambdas = [0.01,0.001]
max_iters = [1e5]

train_reses = list()
test_reses = list()
for window in ppm_windows:

    train_labels = _load_pickle_file(f'{outputs_path}/intermediateOutputs/datasets/train_unnorm_dist_{window}_ppm.pkl')
    train_specs = _load_pickle_file(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_dif_ce_{window}_ppm.pkl')

    #just focus on first setting for now
    train_labels = train_labels.iloc[:,:len(comparison_metrics)]
    train_specs = train_specs.iloc[:,[0,1,2,-2]]
    train_specs.columns=['mzs','query','target','precursor']

    test_labels = _load_pickle_file(f'{outputs_path}/intermediateOutputs/datasets/test_unnorm_dist_{window}_ppm.pkl')
    test_specs = _load_pickle_file(f'{outputs_path}/intermediateOutputs/splitMatches/test_cleaned_matches_dif_ce_{window}_ppm.pkl')

    test_labels = test_labels.iloc[:,:len(comparison_metrics)]
    test_specs = test_specs.iloc[:,[0,1,2,-2]]
    test_specs.columns=['mzs','query','target','precursor']

    funcs = testUtils.create_all_funcs_stoch(reg_funcs=reg_funcs,
                                        reg_names=reg_names,
                                        losses=losses,
                                        loss_names=loss_names,
                                        momentums=momentums,
                                        inits = inits,
                                        params=fit_funcs,
                                        mom_weights=mom_weights,
                                        lambdas=lambdas,
                                        max_iters=max_iters,
                                        func = TunaSims.tuna_combo_distance_demo)

    print(f'total number of functions : {len(funcs)}')
    trained=list()
    for func in funcs:

        # the text before the first '-' in the function name selects the target column
        name = func.name.split('-')[0]
        train_specs['match'] = train_labels[name]

        func.fit(train_specs)
        trained.append(func)
        print(func.name)

    #get train and test errors under proper normalization protocol
    # The evaluation runs inside the per-window loop. As a separate loop it reused only
    # the last window's data and appended to a DataFrame on later windows.
    reload(testUtils)
    trained_res=list()
    test_res=list()
    names=list()
    for func in trained:

        #generate proper train and test datasets
        name = func.name.split('-')[0]
        train_specs['match'] = train_labels[name]
        test_specs['match'] = test_labels[name]

        #get trained_func
        pred_func = func.trained_func()

        trained_res.append(testUtils.get_func_dist(train_specs, pred_func, name))
        test_res.append(testUtils.get_func_dist(test_specs, pred_func, name))
        names.append(f'{name}_{func.regularization_name}_{func.momentum_type}_{func.momentum_weights}')
        print(f'{name}_{func.regularization_name}_{func.momentum_type}_{func.momentum_weights}')

    # one column per fitted function
    trained_res = pd.DataFrame(trained_res).transpose()
    trained_res.columns  = names

    test_res = pd.DataFrame(test_res).transpose()
    test_res.columns  = names

    train_reses.append(trained_res)
    test_reses.append(test_res)

# Both files hold the list of per-window results and are named after the last window.
with open(f'{outputs_path}/intermediateOutputs/train_to_func/trained_reses_{window}_ppm.pkl', 'wb') as handle:

    pickle.dump(train_reses, handle)

with open(f'{outputs_path}/intermediateOutputs/train_to_func/test_reses_{window}_ppm.pkl', 'wb') as handle:

    pickle.dump(test_reses, handle)

del(train_reses)
del(test_reses)
del(train_labels)
del(test_labels)
del(train_specs)
del(test_specs)

In [None]:
# Reload the 3 ppm training labels and matched spectra for interactive inspection.
train_labels_path = f'{outputs_path}/intermediateOutputs/datasets/train_unnorm_dist_3_ppm.pkl'
with open(train_labels_path, 'rb') as handle:
    train_labels = pickle.load(handle)

train_specs_path = f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_dif_ce_3_ppm.pkl'
with open(train_specs_path, 'rb') as handle:
    train_specs = pickle.load(handle)

# NOTE(review): this rebinds comparison_metrics to a shorter list than the one in the
# parameters cell, so every cell run afterwards sees the shorter list.
comparison_metrics = [
    'entropy',
    'manhattan',
    'lorentzian',
    'dot_product',
    'fidelity',
    'matusita',
    'chi2',
    'laplacian',
    'harmonic_mean',
    'bhattacharya_1',
    'squared_chord',
    'cross_ent',
]

#just focus on first setting for now
n_first_setting = len(comparison_metrics)
train_labels = train_labels.iloc[:, :n_first_setting]

# keep the first three columns and the second-to-last one, under fixed names
train_specs = train_specs.iloc[:, [0, 1, 2, -2]]
train_specs.columns = ['mzs', 'query', 'target', 'precursor']

In [None]:
# Render the trimmed training frame as the cell output for a quick visual check.
train_specs

Define Distance Functions by Features

In [None]:
def _pairwise_param_combos(groups):
    """Build one parameter spec for every unordered pair of entries in `groups`.

    `groups` maps a name to a ``(param_names, extra)`` tuple. Each pair, including
    an entry paired with itself, is stored under ``f'{a}_{b}'`` as
    ``(sorted union of both name lists, testUtils.dict_combine(extra_a, extra_b))``.
    The mirrored key ``f'{b}_{a}'`` is skipped once ``f'{a}_{b}'`` exists.

    Returns a new dict in insertion order (first-seen pair order).
    """
    combos = dict()
    for key in groups:
        for key_ in groups:
            if f'{key_}_{key}' in combos:
                continue
            combos[f'{key}_{key_}'] = (
                sorted(set(groups[key][0] + groups[key_][0])),
                testUtils.dict_combine(groups[key][1], groups[key_][1]),
            )
    return combos


def _add_normed_variants(dest, base, add_terms, mult_terms, limit=5):
    """Add ``_normed_add`` / ``_normed_mult`` variants of the first `limit` specs in `base` to `dest`.

    Each variant is the base spec's name list extended with `add_terms`
    (respectively `mult_terms`), de-duplicated and sorted.
    """
    for key in list(base.keys())[:limit]:
        names, extra = base[key]
        dest[f'{key}_normed_add'] = (sorted(set(names + add_terms)), testUtils.dict_combine(extra, None))
        dest[f'{key}_normed_mult'] = (sorted(set(names + mult_terms)), testUtils.dict_combine(extra, None))


# `params` is created unconditionally so the sigtune loop further down also works
# when the quadk branch is disabled (previously a NameError on a fresh kernel).
params = dict()

quadk = False
if quadk:
    flats = {
            'fdif_quadk':(['a','b','c','d','e'],None),
            'fadd_quadk':(['f','g','h','i','j'],None),
            'fmult_quadk':(['k','l','m','n','o'],None),
    }

    exts = {'edif_add':(['b','g','p','q','r','s','t','u','v','w'],None),
            'edif_mult':(['b','l','x','y','z','a_','b_','c_','d_','e_'],None),
            'emult_add':(['l','g','f_','g_','h_','i_','j_','k_','l_','m_'],None),
    }

    params = _pairwise_param_combos(flats)
    params_ = _pairwise_param_combos(exts)
    params.update(params_)

    params['all_flat_quadk']= (['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'],None)
    params['all_ext_quadk'] = (['b','l','g','p','q','r','s','t','u','v','w','x','y','z','a_','b_','c_','d_','e_','f_','g_','h_','i_','j_','k_','l_','m_','t_','u_','v_','w_','x_','y_'],None)

    _add_normed_variants(params, params_,
                         ['n_','o_','p_','q_','r_','s_'],
                         ['t_','u_','v_','w_','x_','y_'])

    params['norm_only_add']=(['n_','o_','p_','q_','r_','s_'],None)
    params['norm_only_mult']=(['t_','u_','v_','w_','x_','y_'],None)

quad=True
if quad:
    flats = {
            'fdif_quad':(['a','b','c'],None),
            'fadd_quad':(['f','g','h'],None),
            'fmult_quad':(['k','l','m'],None),
    }

    exts = {'edif_add_quad':(['b','g','p','q','r','s','t','u'],None),
            'edif_mult_quad':(['b','l','x','y','z','a_','b_','c_'],None),
            'emult_add_quad':(['l','g','f_','g_','h_','i_','j_','k_'],None),
    }

    # Insertion order matters below: flat pairs, the two "all" specs, the normed
    # variants, and only then the plain ext pairs.
    params2 = _pairwise_param_combos(flats)
    params2_ = _pairwise_param_combos(exts)

    params2['all_flat_quad']= (['a','b','c','f','g','h','k','l','m'],None)
    params2['all_ext_quad'] = (['b','l','g','p','q','r','s','t','u','x','y','z','a_','b_','c_','f_','g_','h_','i_','j_','k_'],None)

    _add_normed_variants(params2, params2_,
                         ['n_','o_','p_','s_'],
                         ['t_','u_','v_','w_','x_','y_'])

    params2.update(params2_)
    #params.update(params2)

    # Sigmoid-tuning variants of the first ten quad specs. The base spec is read
    # from params2, where these keys are guaranteed to exist; reading params[key]
    # raised KeyError/NameError once the update above was commented out.
    # NOTE(review): the variants are stored in `params`, which is NOT the dict
    # passed to create_all_funcs_stoch below — merge them into params2 if they
    # are meant to be trained.
    for key in list(params2.keys())[:10]:
        params[f'{key}_sigtune']=(params2[key][0]+['z_'],None)

    # for key in list(params2.keys())[:10]:
    #     params[f'{key}_with_mz']=(params[key][0]+['z_'],None)

    reload(func_ob)
    reload(TunaSims)
    reload(testUtils)
    #helper lambda funcs
    squared_loss = lambda x: (1-x)**2
    lin_loss = lambda x: np.abs(1-x)
    l1_reg = lambda l,x: l*np.sum(np.abs(x))
    l2_reg = lambda l,x: l*np.sqrt(np.sum(x**2))
    no_reg = lambda x: 0

    reg_funcs = [no_reg,partial(l2_reg,0.01),partial(l2_reg,0.1)]
    reg_names = ['none_none','l2_0.01','l2_0.1']
    losses = [squared_loss]
    loss_names = ['squared']
    momentums = ['none']
    mom_weights = [[0.2,0.8]]
    lambdas = [0.01]
    max_iters = [1e4]

    # Both collision-energy regimes start from the same grid of specifications;
    # two separate calls keep the two lists independent objects for training.
    # `inits` is expected to come from an earlier cell.
    stoch_kwargs = dict(reg_funcs=reg_funcs,
                        reg_names=reg_names,
                        losses=losses,
                        loss_names=loss_names,
                        momentums=momentums,
                        params=params2,
                        inits=inits,
                        mom_weights=mom_weights,
                        lambdas=lambdas,
                        max_iters=max_iters,
                        func = TunaSims.tuna_combo_distance)

    funcs_same = testUtils.create_all_funcs_stoch(**stoch_kwargs)
    funcs_dif = testUtils.create_all_funcs_stoch(**stoch_kwargs)

    all_funcs_ = [funcs_same, funcs_dif]

    print(f'number of specifications: {len(funcs_same)}')


In [None]:
# Fit every function specification on the same-CE and different-CE training
# matches of each ppm window and pickle the fitted objects per window.
for window in ppm_windows:

    trained_dict = dict()
    # Fresh copies so each window starts from the unfitted templates.
    all_funcs = copy.deepcopy(all_funcs_)

    sub_train_same_ce = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_same_ce_{window}_ppm.pkl')
    sub_train_dif_ce = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_dif_ce_{window}_ppm.pkl')

    train_datasets = [sub_train_same_ce, sub_train_dif_ce]
    dataset_names = ['same_ce','dif_ce']

    settings = 1

    for ds_idx, (ds_name, dataset) in enumerate(zip(dataset_names, train_datasets)):

        # NOTE(review): with settings > 1 the same model objects are re-fit for
        # every j and every trained_dict entry of this dataset would reference
        # the last fit; copy the models per setting before raising `settings`.
        for j in range(settings):

            # Setting j occupies columns 3j..3j+2. Copy explicitly so the column
            # assignments below act on an independent frame, not an iloc slice.
            sub = dataset.iloc[:,3*j:3*(j+1)].copy()
            sub.columns=['mzs','query','target']
            sub['precursor'] = dataset['precursor']
            sub['match'] = dataset['match']

            trained=list()
            for i, model in enumerate(all_funcs[ds_idx]):

                model.fit(sub)
                trained.append(model)
                if (i+1)%10==0:
                    print(f'trained {i+1} functions on {ds_name}_{j}')

            trained_dict[f'{ds_name}_{j}'] = trained

    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_dict_{window}_ppm.pkl', 'wb') as handle:

        pickle.dump(trained_dict, handle)

    # The fitted models now live on disk; free the memory before the next window.
    del trained_dict
    

    

Get AUCs

In [None]:
def _setting_frame(dataset, j):
    """Return setting `j` of `dataset` as a frame with fixed column names.

    Setting j occupies columns 3j..3j+2 (renamed to mzs/query/target); the
    dataset's 'precursor' and 'match' columns are attached unchanged.
    """
    sub = dataset.iloc[:,3*j:3*(j+1)].copy()
    sub.columns=['mzs','query','target']
    sub['precursor'] = dataset['precursor']
    sub['match'] = dataset['match']
    return sub


def _score_split(split, window, trained_dict, dataset_names, settings):
    """Score the trained models of every dataset/setting on one data split.

    `split` is the file-name prefix ('train', 'val' or 'test'). Returns a list of
    ``(setting_name, result_frame)`` tuples in dataset-major, setting-minor
    order, where result_frame comes from testUtils.trained_res_to_df.
    """
    scored = list()
    for ce_name in dataset_names:
        dataset = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/{split}_cleaned_matches_{ce_name}_{window}_ppm.pkl')
        for j in range(settings):
            setting_name = f'{ce_name}_{j}'
            #grab trained models for this portion of dataframe
            models = trained_dict[setting_name]
            scored.append((setting_name, testUtils.trained_res_to_df(models, _setting_frame(dataset, j))))
            print(f'completed {setting_name}')
    return scored


for window in ppm_windows:

    # Reload the models fitted for this window: the training cell deletes
    # trained_dict after pickling it, so it is not available in memory here.
    # (The pickle is produced by this notebook; do not point this at untrusted files.)
    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_dict_{window}_ppm.pkl', 'rb') as handle:
        trained_dict = pickle.load(handle)

    dataset_names = ['same_ce','dif_ce']
    settings=1

    # Train results carry the full result frame plus a 'settings' label.
    train_parts = list()
    for setting_name, small in _score_split('train', window, trained_dict, dataset_names, settings):
        small.insert(1,'settings', setting_name)
        train_parts.append(small)
    trained_res = pd.concat(train_parts)
    print('generated train results')

    # Val/test contribute only their AUC column; rows line up with the train
    # results because all three splits are scored in the same order.
    val_aucs=list()
    for setting_name, small in _score_split('val', window, trained_dict, dataset_names, settings):
        val_aucs.extend(small['auc'].tolist())
    trained_res['val']=val_aucs
    print('generated val results')

    test_aucs=list()
    for setting_name, small in _score_split('test', window, trained_dict, dataset_names, settings):
        test_aucs.extend(small['auc'].tolist())
    trained_res['test']=test_aucs
    print('generated test results')

    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_res_{window}_ppm.pkl', 'wb') as handle:

        pickle.dump(trained_res, handle)

    del trained_res
    del trained_dict

Conclusions: 

add offsets for terms

number of params does not appear to change train time much

consider replacing knockouts with sigmoids

consider tuning final sigmoid

should features like length and entropy be included in the similarity, or be used outside it as extra features in the learned model? both? neither?


Other Ideas:

Accuracy (In order of increasing difficulty):

-Incorporate as feature how many possible chem structures (can also restrict to NPS) exist within a certain precursor distance. (violating golden rules or not)

-include original NIST version or theoretical res as feature

-Weight different ranges of spec differently for matches (more diversity/greater accuracy)

-smush together top n results over different inchicores and come up with combined model predicting over individual inchicores

-diagnostic ion/loss classing as a feature...do they match

-kernelized smooth match

-3d struct guesses...do they match (cores, but can generalize to 3d)

Speed(In order of increasing difficulty):

-combine sim metrics and expand(apply func to df)

-exclude matches based on non-similarity features to cut down on needed comparisons

-ion tables to upper bound similarity

-only use one peak consolidation and matching protocol...then only do reweight transformations on already matched peaks for spec and sim features

-can missing peaks in lower energy be explained by frags and losses from higher energy? incorporate into model

Order to proceed:

-recreate databases with coll energy included (standardized format across DBs)

-what proportion of matches are the same coll energy?

-quantify variability in peak appearance vs peak intensity across collision energies
    -does this relate in a predictable way to fragment mass

-test sim metrics for same coll energy vs different coll energy (is the same inductive bias useful?)

-Show that regular funcs are in the space of combo distance

-test combining individual metrics that use different components of the 2 vectors (add, mult, dif)

-range over individual metrics in combined score in attempt to explain why combining them is successful

-train combo metrics with flattened components and individual (should these sims be broken out?)
    -should we do this for same coll energy vs dif energies

-are different combo metrics put into larger model more successful than the combined individual metrics

-can tunasims be fit with nonlinearities between the components (flattened or not?)