In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from importlib import reload
import os
from collections import Counter
from sklearn.metrics import roc_auc_score as auc
import copy
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
import pickle
import warnings
warnings.filterwarnings("ignore")

import TunaSims
import func_ob
import tools
import datasetBuilder
import testUtils
import spectral_similarity

Results for Different Ways of Distributing Interspectral Intensity Difference

In [2]:
# Databases: pickled spectral libraries that can serve as query or target,
# plus the root directory that receives every intermediate output below.
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own data before running on another machine.
outputs_path='/Users/jonahpoczobutt/projects/TunaRes/test'
nist14='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist14_highres.pkl'
nist20_prot_deprot = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist20_prot_deprot.pkl'
nist23_hr_prot_deprot_only = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_prot_deprot_only.pkl'
nist23_hr_full ='/Users/jonahpoczobutt/projects/raw_data/db_csvs/nist23_full.pkl'
gnps='/Users/jonahpoczobutt/projects/raw_data/db_csvs/gnps_highres.pkl'
mona='/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_highres.pkl'
metlin='/Users/jonahpoczobutt/projects/raw_data/db_csvs/metlin_highres_inst.pkl'
mona_nist = '/Users/jonahpoczobutt/projects/raw_data/db_csvs/mona_nist_prot_only.pkl'

# Which library is searched against which; a self search uses the query
# library as its own target.
self_search=False
query = metlin
target = nist23_hr_full
if self_search:
    target=query

# fullRun gates creation of the output directory tree.
fullRun=True
if fullRun:
    # makedirs(exist_ok=True) creates missing parents and tolerates directories
    # left over from a previous run, so Restart & Run All no longer dies with
    # FileExistsError. Files already present in the tree are NOT cleared here;
    # later cells overwrite the ones they regenerate.
    for subdir in ('splitMatches', 'datasets', 'gbc_res', 'train_to_func', 'train_to_error'):
        os.makedirs(f'{outputs_path}/intermediateOutputs/{subdir}', exist_ok=True)

In [3]:
# Split the query library into three disjoint groups of compounds (by
# 'inchi_base') so that no compound appears in more than one of the
# first/second/third query files consumed by the dataset-building cell.
fullRun=True
if fullRun:

    # TODO (original author): replace with a function that reads in all the databases
    query_ = pd.read_pickle(query)
    all_bases = list(set(query_['inchi_base']))

    # Self searches get a running integer id; otherwise every row gets the
    # placeholder "_".
    if self_search:
        query_.insert(0,'queryID', list(range(len(query_))))
    else:
        query_.insert(0,'queryID', ["_"] * len(query_))

    # In-place shuffle. NOTE(review): no seed is set, so the split differs
    # between runs; seed numpy beforehand if a reproducible split is needed.
    np.random.shuffle(all_bases)

    # 50% / 20% / 30% of the distinct bases.
    first_cut = int(len(all_bases)*0.5)
    second_cut = int(len(all_bases)*0.7)
    base_splits = {
        'first': all_bases[:first_cut],
        'second': all_bases[first_cut:second_cut],
        'third': all_bases[second_cut:],
    }

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'
    for split_name, split_bases in base_splits.items():

        # reset_index() returns a new frame (keeping the old index as an
        # 'index' column, as before) instead of mutating a filtered slice.
        split_query_ = query_[np.isin(query_['inchi_base'], split_bases)].reset_index()
        split_query_.to_pickle(f'{split_dir}/{split_name}_query.pkl')
        del(split_query_)

        np.save(f'{split_dir}/{split_name}_bases.npy', split_bases)

    # Free the large intermediates; later cells reload what they need from disk.
    del(query_)
    del(base_splits)
    del(all_bases)


In [4]:
# Similarity methods and spectrum-transformation parameters.
# Leave sim methods as None to run all.
reload(datasetBuilder)
reload(tools)

comparison_metrics = [
    'entropy',
    'manhattan',
    'lorentzian',
    'dot_product',
    'fidelity',
    'matusita',
    'chi2',
    'laplacian',
    'harmonic_mean',
    'bhattacharya_1',
    'squared_chord',
    'cross_ent',
]

# Every combination of the cleaning settings below is handed to
# datasetBuilder.create_cleaned_df by the dataset-building loop.
ppm_windows = [10]
noise_threshes = [0.01, 0.0]
centroid_tolerance_vals = [0.05]
centroid_tolerance_types = ['da']
powers = ['orig', 1]
sim_methods = comparison_metrics
prec_removes = [True]
build_dataset = True

# Size arguments passed to datasetBuilder.create_matches_df
# (train split / validation and test splits).
train_size = 3e6
test_size = 2e6

max_matches = None
adduct_match = False

# Load the target library once; it is reused for every split and ppm window.
target_ = pd.read_pickle(target)

# Self searches number the rows; otherwise every target row gets "*".
target_.insert(
    0,
    'queryID',
    list(range(len(target_))) if self_search else ["*"] * len(target_),
)

# For every ppm window, build the train / validation / test match tables from
# the three query splits and write their cleaned same-CE and different-CE
# variants to disk. Nothing is kept in memory: the next cell reloads the
# cleaned pickles from splitMatches/.
for i in ppm_windows:

    if build_dataset:

        split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'

        # (output prefix, query pickle written by the split cell, size argument
        # forwarded to create_matches_df)
        split_specs = (
            ('train', 'first_query.pkl', train_size),
            ('val', 'second_query.pkl', test_size),
            ('test', 'third_query.pkl', test_size),
        )

        for split_name, query_file, split_size in split_specs:

            # Read this split's queries and shuffle their order. The original
            # validation branch stored the shuffled frame under a misspelled
            # name, so validation queries were passed on unshuffled; every
            # split is now treated identically.
            split_query = pd.read_pickle(f'{split_dir}/{query_file}')
            split_query = split_query.sample(frac=1)

            # Candidate query/target matches for this split.
            matches = datasetBuilder.create_matches_df(split_query,target_,i,max_matches,split_size, adduct_match)
            matches.to_pickle(f'{split_dir}/{split_name}_matches_{i}_ppm.pkl')
            del(split_query)

            # Separate matches whose 'ceratio' is exactly 1 from all the others.
            matches_same_ce = matches[matches['ceratio']==1]
            matches_dif_ce = matches[matches['ceratio']!=1]

            for ce_name, ce_matches in (('same_ce', matches_same_ce), ('dif_ce', matches_dif_ce)):

                cleaned = datasetBuilder.create_cleaned_df(
                    ce_matches,
                    sim_methods,
                    noise_threshes,
                    centroid_tolerance_vals,
                    centroid_tolerance_types,
                    powers,
                    prec_removes
                )
                cleaned.to_pickle(f'{split_dir}/{split_name}_cleaned_matches_{ce_name}_{i}_ppm.pkl')
                del(cleaned)


100013 rows created


Create Train/Val/Test Data & get Individual Results

In [None]:

def _build_metric_features(cleaned, metrics, with_unnorm=False):
    """Score every (query, target) column pair of a cleaned match table.

    Consecutive column pairs of ``cleaned`` (columns 2j and 2j+1) are treated as
    one cleaning setting each and handed to ``testUtils.orig_metric_to_df``
    together with the table's 'match' labels.

    Parameters
    ----------
    cleaned : pandas.DataFrame
        Cleaned match table as written by the dataset-building cell; must
        contain a 'match' column.
    metrics : list of str
        Metric names forwarded to ``testUtils.orig_metric_to_df``.
    with_unnorm : bool
        When True, call the helper with ``unnnormalized=True`` (keyword spelled
        as in testUtils) and also collect its third return value.

    Returns
    -------
    aucs : pandas.DataFrame
        Per-setting AUC frames stacked row-wise, with each 'metric' label
        suffixed by ``_<setting index>``.
    features : pandas.DataFrame
        Per-setting feature frames joined column-wise, plus a 'match' column.
    unnorm : pandas.DataFrame or None
        Per-setting unnormalised frames joined column-wise, or None when
        ``with_unnorm`` is False.
    """
    labels = cleaned['match'].tolist()
    auc_parts, feature_parts, unnorm_parts = list(), list(), list()

    for j in range(int(cleaned.shape[1]/2)):

        # Work on an explicit copy so renaming columns and adding 'match'
        # never touches the cleaned table itself.
        pair = cleaned.iloc[:,2*j:2*(j+1)].copy()
        pair.columns = ['query','target']
        pair['match'] = labels

        if with_unnorm:
            ind_aucs, inds, inds_unnorm = testUtils.orig_metric_to_df(metrics, pair, unnnormalized=True)
            unnorm_parts.append(inds_unnorm)
        else:
            ind_aucs, inds = testUtils.orig_metric_to_df(metrics, pair)

        # Tag each row with its setting so labels stay attached to their values.
        auc_parts.append(ind_aucs.assign(metric=[f'{x}_{j}' for x in ind_aucs['metric']]))
        feature_parts.append(inds)

    aucs = pd.concat(auc_parts)
    features = pd.concat(feature_parts, axis=1)
    features['match'] = labels
    unnorm = pd.concat(unnorm_parts, axis=1) if with_unnorm else None

    return aucs, features, unnorm


for i in ppm_windows:

    split_dir = f'{outputs_path}/intermediateOutputs/splitMatches'
    dataset_dir = f'{outputs_path}/intermediateOutputs/datasets'
    dataset_names = ['same_ce','dif_ce']

    # One row per (metric, setting); one AUC column per (same/dif, split).
    # The labels are taken from the frames returned by orig_metric_to_df so
    # they follow the same setting-major order as the AUC values. The previous
    # metric-major label list did not line up with those values.
    ind_aucs_full = None

    # (split prefix, whether unnormalised distances are also collected)
    for split_name, with_unnorm in (('train', True), ('val', False), ('test', True)):

        gbc_datasets = list()
        unnorm_dists = None

        for ce_name in dataset_names:

            cleaned = pd.read_pickle(f'{split_dir}/{split_name}_cleaned_matches_{ce_name}_{i}_ppm.pkl')
            aucs, features, unnorm = _build_metric_features(cleaned, comparison_metrics, with_unnorm)
            del(cleaned)

            if ind_aucs_full is None:
                ind_aucs_full = pd.DataFrame(aucs['metric'].tolist(), columns=['metric'])

            # Column names: same_train, dif_train, same_val, ...
            ind_aucs_full[f"{ce_name.split('_')[0]}_{split_name}"] = aucs['AUC'].tolist()

            gbc_datasets.append(features)
            # Only the frame of the LAST dataset (dif_ce) is kept: the
            # function-fitting cell loads this pickle as a single DataFrame and
            # pairs it with the dif_ce cleaned matches.
            unnorm_dists = unnorm

        with open(f'{dataset_dir}/gbc_{split_name}_{i}_ppm.pkl', 'wb') as handle:

            pickle.dump(gbc_datasets, handle)

        if with_unnorm:

            with open(f'{dataset_dir}/{split_name}_unnorm_dist_{i}_ppm.pkl', 'wb') as handle:

                pickle.dump(unnorm_dists, handle)

        del(unnorm_dists)
        del(gbc_datasets)

        print(f'created {split_name} data')

Create indices to pull for each metric and those with same components, specify GBC models

In [None]:
# Load the training features only to learn how many feature columns exist.
with open(f'{outputs_path}/intermediateOutputs/datasets/gbc_train_{ppm_windows[0]}_ppm.pkl', 'rb') as handle:

    gbc_train_datasets = pickle.load(handle)

# Gradient-boosting configurations; each one is trained on every feature subset.
models = [
    hgbc(),
    hgbc(learning_rate=0.5),
    hgbc(max_iter=200),
    hgbc(learning_rate=0.01,min_samples_leaf=10),
    hgbc(max_iter=200,min_samples_leaf=10),
    hgbc(learning_rate=0.5, max_iter=200,min_samples_leaf=10),
]

n_metrics = len(comparison_metrics)
# Every column except the trailing 'match' label is a feature.
n_feature_cols = gbc_train_datasets[0].shape[1]-1

# Positions inside one setting's block of metric columns. These are offsets
# into comparison_metrics, so they must be revisited if that list is reordered.
mult_offsets = np.array([3,4,9])
ent_offsets = np.array([0,11])
dif_offsets = np.array([1,2,5,7,10])

# Feature-subset name -> column positions. Names use '-' internally because
# the training cell later recovers the subset name with split('_')[0].
indices = dict()
indices['all-sims'] = list(range(n_feature_cols))
indices['all-mults'] = list()
indices['all-ents'] = list()
indices['all-difs'] = list()

for setting in range(int(n_feature_cols/n_metrics)):

    shift = setting*n_metrics

    setting_mults = list(mult_offsets+shift)
    setting_ents = list(ent_offsets+shift)
    setting_difs = list(dif_offsets+shift)

    indices[f'setting-{setting}-all'] = list(np.array(range(n_metrics))+shift)
    indices[f'mults-{setting}'] = setting_mults
    indices[f'ents-{setting}'] = setting_ents
    indices[f'difs-{setting}'] = setting_difs

    # The cross-setting groups accumulate the per-setting positions.
    indices['all-mults'] = indices['all-mults'] + setting_mults
    indices['all-ents'] = indices['all-ents'] + setting_ents
    indices['all-difs'] = indices['all-difs'] + setting_difs


print(f' total number of models for each: {len(models) * len(indices)}')

Train Models and Collect Train, Validation, and Test AUC

In [None]:
def _positive_class_auc(model, features, labels):
    """ROC AUC of a fitted classifier, scored with its probability for class 1."""
    pos_ind = np.where(model.classes_==1)[0][0]
    return auc(labels, model.predict_proba(features)[:,pos_ind])


# Position of each collision-energy group inside the pickled dataset lists.
ce_groups = (('same', 0), ('dif', 1))

for window in ppm_windows:

    dataset_dir = f'{outputs_path}/intermediateOutputs/datasets'

    with open(f'{dataset_dir}/gbc_train_{window}_ppm.pkl', 'rb') as handle:

        gbc_train_datasets = pickle.load(handle)

    model_names = list()
    train_aucs = {'same': list(), 'dif': list()}
    trained_models = dict()

    # Fit a fresh copy of every model configuration on every feature subset,
    # first on the same-CE data and then on the different-CE data.
    for key, value in indices.items():

        for ce_name, ce_pos in ce_groups:

            train_frame = gbc_train_datasets[ce_pos]
            sub = train_frame.iloc[:,value]

            for model_pos, model in enumerate(copy.deepcopy(models)):

                model.fit(sub,train_frame['match'])
                train_aucs[ce_name].append(_positive_class_auc(model, sub, train_frame['match']))
                trained_models[f'{ce_name}_{key}_{model_pos}'] = model

                # Names are recorded once per (subset, configuration).
                if ce_name == 'same':
                    model_names.append(f'{key}_{model_pos}')

    model_aucs = pd.DataFrame([model_names, train_aucs['same'], train_aucs['dif']]).transpose()
    model_aucs.columns = ['name','same_train','dif_train']

    del(gbc_train_datasets)

    # Score every trained model on the validation and then the test features.
    held_out_aucs = dict()
    for split_name in ('val', 'test'):

        with open(f'{dataset_dir}/gbc_{split_name}_{window}_ppm.pkl', 'rb') as handle:

            held_out_datasets = pickle.load(handle)

        split_aucs = {'same': list(), 'dif': list()}
        for name in model_aucs['name'].tolist():

            # '<subset>_<configuration>' -> '<subset>'
            subset_name = name.split('_')[0]

            for ce_name, ce_pos in ce_groups:

                held_out_frame = held_out_datasets[ce_pos]
                sub = held_out_frame.iloc[:,indices[subset_name]]
                model = trained_models[f'{ce_name}_{name}']
                split_aucs[ce_name].append(_positive_class_auc(model, sub, held_out_frame['match']))

        held_out_aucs[split_name] = split_aucs
        del(held_out_datasets)

    model_aucs['same_val'] = held_out_aucs['val']['same']
    model_aucs['dif_val'] = held_out_aucs['val']['dif']
    model_aucs['same_test'] = held_out_aucs['test']['same']
    model_aucs['dif_test'] = held_out_aucs['test']['dif']

    with open(f'{outputs_path}/intermediateOutputs/gbc_res/model_aucs_{window}_ppm.pkl', 'wb') as handle:

        pickle.dump(model_aucs,handle)

    del(model_aucs)

Train Functions to original metrics, evaluate how far off we are on test data with original normalization

In [None]:
# Initial values for the tunable parameters handed to
# testUtils.create_all_funcs_stoch: 'a'..'z' followed by 'a_'..'z_'.
# Every parameter starts at 1 except the ones listed below.
_alphabet = 'abcdefghijklmnopqrstuvwxyz'
_param_names = list(_alphabet) + [f'{letter}_' for letter in _alphabet]

# Parameters that start at -1.
_negative_inits = {'d', 'i', 'n', 't', 'v',
                   'b_', 'd_', 'j_', 'l_', 'q_', 's_', 'w_', 'y_'}
# Parameters that start at 0.
_zero_inits = {'h', 's'}

inits = dict()
for _name in _param_names:
    if _name in _negative_inits:
        inits[_name] = -1
    elif _name in _zero_inits:
        inits[_name] = 0
    else:
        inits[_name] = 1


# Parameter sets used when fitting the tunable distance to each original
# metric. Keys are '<metric>-<level>': the part before '-' must be a metric
# name (the fitting loop uses it to pick the label column), and the level
# (1-3) marks progressively larger parameter sets. Each value is
# (parameter names, None); the second element is passed through unchanged to
# testUtils.create_all_funcs_stoch.
# NOTE(review): a few lists repeat a name ('b','l' in harmonic_mean-2/-3,
# 'k' in dot_product-3); left as written, confirm whether that is intended.
fit_funcs = {
    'entropy-1':(['f','g','i','n_','p_'],None),
    'entropy-2':(['f','g','h','i','j','n_','o_','p_','q_','r_','s_'],None),
    'entropy-3':(['f','g','h','i','j','n_','o_','p_','q_','r_','s_','b','l','x','y','z','a_','b_','c_','d_','e_'],None),
    'lorentzian-1':(['a','b'],None),
    'lorentzian-2':(['a','b','c','d','e'],None),
    'lorentzian-3':(['a','b','c','d','e','f','g','h','i','j','k','n_','o_','p_','q_','r_','s_'],None),
    'dot_product-1':(['k','l','t_','u_'],None),
    'dot_product-2':(['k','l','m','n','o','t_','u_','v_','w_','x_','y_'],None),
    'dot_product-3':(['k','l','m','n','o','t_','u_','v_','w_','x_','y_','k','n_','o_','p_','q_','r_','s_'],None),
    'harmonic_mean-1':(['x','y','z','a_','b_','c_'],None),
    'harmonic_mean-2':(['b','l','x','y','z','a_','b_','c_','d_','e_','b','l'],None),
    'harmonic_mean-3':(['b','l','x','y','z','a_','b_','c_','d_','e_','b','l','n_','o_','p_','q_','r_','s_'],None),
    'fidelity-1':(['k','l','m'],None),
    'fidelity-2':(['k','l','m','n','o',],None),
    # Was a second 'fidelity-2' key, which silently replaced the entry above.
    'fidelity-3':(['k','l','m','n','o','n_','o_','p_','q_','r_','s_'],None),
    'squared_chord-1':(['a','b'],None),
    'squared_chord-2':(['a','b','c','d','e'],None),
    'squared_chord-3':(['a','b','c','d','e','f','g','h','i','j','k','n_','o_','p_','q_','r_','s_'],None),
    'bhattacharya_1-1':(['a','b'],None),
    'bhattacharya_1-2':(['a','b','c','d','e'],None),
    'bhattacharya_1-3':(['a','b','c','d','e','f','g','h','i','j','k','n_','o_','p_','q_','r_','s_'],None),
}

# Fit every function specification in fit_funcs to the unnormalised values of
# the metric named in its key, then record what testUtils.get_func_dist returns
# for each fitted function on the train and test spectra.
# Pick up any edits made to the project modules since they were imported.
reload(testUtils)
reload(func_ob)
reload(TunaSims)

# One results frame per ppm window.
train_reses = list()
test_reses = list()
for i in ppm_windows:

    # Unnormalised metric values written by the feature-building cell. As that
    # cell is written, each of these pickles holds a single DataFrame (the one
    # computed from the dif_ce data), which is why the matching spectra below
    # are read from the *_dif_ce_* cleaned-match files.
    with open(f'{outputs_path}/intermediateOutputs/datasets/train_unnorm_dist_{i}_ppm.pkl', 'rb') as handle:

        train_labels = pickle.load(handle)

    with open(f'{outputs_path}/intermediateOutputs/datasets/test_unnorm_dist_{i}_ppm.pkl', 'rb') as handle:

        test_labels = pickle.load(handle)

    with open(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_dif_ce_{i}_ppm.pkl', 'rb') as handle:

        train_specs = pickle.load(handle)

    with open(f'{outputs_path}/intermediateOutputs/splitMatches/test_cleaned_matches_dif_ce_{i}_ppm.pkl', 'rb') as handle:

        test_specs = pickle.load(handle)

    #just focus on first setting for now
    # i.e. the first len(comparison_metrics) label columns and the first
    # (query, target) column pair of the cleaned matches.
    train_labels = train_labels.iloc[:,:len(comparison_metrics)]
    train_specs = train_specs.iloc[:,:2]
    train_specs.columns=['query','target']

    test_labels = test_labels.iloc[:,:len(comparison_metrics)]
    test_specs = test_specs.iloc[:,:2]
    test_specs.columns=['query','target']

    # Loss and regularisation building blocks. lin_loss and l1_reg are defined
    # but not included in the lists passed to create_all_funcs_stoch below.
    squared_loss = lambda x: (x)**2
    lin_loss = lambda x: np.abs(x)
    l1_reg = lambda l,x: l*np.sum(np.abs(x))
    l2_reg = lambda l,x: l*np.sqrt(np.sum(x**2))
    no_reg = lambda x: 0

    # Three regularisation variants: none, and l2_reg with l fixed to 0.01 / 0.1.
    reg_funcs = [no_reg,partial(l2_reg,0.01),partial(l2_reg,0.1)]
    reg_names = ['none','l2_0.01','l2_0.1']
    losses = [squared_loss]
    loss_names = ['squared']
    momentums = ['none']
    mom_weights = [[0.2,0.8]]
    lambdas = [0.01]
    max_iters = [1e4]

    # Build the function objects to fit from the settings above and fit_funcs.
    funcs = testUtils.create_all_funcs_stoch(reg_funcs=reg_funcs,
                                        reg_names=reg_names,
                                        losses=losses,
                                        loss_names=loss_names,
                                        momentums=momentums,
                                        inits = inits,
                                        params=fit_funcs,
                                        mom_weights=mom_weights,
                                        lambdas=lambdas,
                                        max_iters=max_iters,
                                        func = TunaSims.tuna_combo_distance_demo)
    
    print(f'total number of functions : {len(funcs)}')
    trained=list()
    for func in funcs:

        # The text before the first '-' in func.name selects the label column,
        # so it must be a column of train_labels.
        # NOTE(review): presumably func.name starts with the fit_funcs key
        # ('<metric>-<level>...') -- confirm in testUtils.
        name = func.name.split('-')[0]
        # NOTE(review): assigning a Series aligns on index; this assumes
        # train_labels and train_specs share the same row index -- confirm.
        train_specs['match'] = train_labels[name]

        func.fit(train_specs)
        trained.append(func)
        print(func.name)

    #get train and test errors under proper normalization protocol
    trained_res=list()
    test_res=list()
    names=list()
    for func in trained:

        #generate proper train and test datasets
        name = func.name.split('-')[0]
        train_specs['match'] = train_labels[name]
        test_specs['match'] = test_labels[name]

        #get trained_func
        pred_func = func.trained_func()

        trained_res.append(testUtils.get_func_dist(train_specs, pred_func, name))
        test_res.append(testUtils.get_func_dist(test_specs, pred_func, name))
        names.append(func.name)
        

    # One column per fitted function, named after the function.
    trained_res = pd.DataFrame(trained_res).transpose()
    trained_res.columns  = names

    test_res = pd.DataFrame(test_res).transpose()
    test_res.columns  = names

    train_reses.append(trained_res)
    test_reses.append(test_res)

# NOTE(review): these dumps run after the loop, so `i` is the LAST ppm window
# while the lists hold the results of every window. Harmless while ppm_windows
# has a single entry; revisit the file naming before adding more windows.
with open(f'{outputs_path}/intermediateOutputs/train_to_func/trained_reses_{i}_ppm.pkl', 'wb') as handle:

    pickle.dump(train_reses, handle)

with open(f'{outputs_path}/intermediateOutputs/train_to_func/test_reses_{i}_ppm.pkl', 'wb') as handle:

    pickle.dump(test_reses, handle)

# Drop the large objects; the results now live on disk.
del(train_reses)
del(test_reses)
del(train_labels)
del(test_labels)
del(train_specs)
del(test_specs)

Define Distance Functions by Features

In [None]:
def _combine_param_groups(groups):
    """Merge every unordered pair of parameter groups, self-pairs included.

    Parameters
    ----------
    groups : dict
        Maps a group name to ``(parameter names, bounds)``.

    Returns
    -------
    dict
        Maps ``'<name>_<other name>'`` to ``(sorted unique union of both
        parameter lists, testUtils.dict_combine of both bounds)``. A pair is
        stored once, under the order in which it is first met while iterating
        ``groups``; the mirrored name is skipped.
    """
    combined = dict()
    for key in groups.keys():
        for key_ in groups.keys():

            # The mirrored pair was already stored on an earlier pass.
            if f'{key_}_{key}' in combined.keys():
                continue

            combined[f'{key}_{key_}'] = (
                sorted(list(set(groups[key][0]+groups[key_][0]))),
                testUtils.dict_combine(groups[key][1],groups[key_][1]),
            )

    return combined


# --- 'quadk' specifications: five parameters per flat group -------------------
flats = {
          'fdif_quadk':(['a','b','c','d','e'],None),
          'fadd_quadk':(['f','g','h','i','j'],None),
          'fmult_quadk':(['k','l','m','n','o'],None),
}

exts = {'edif_add':(['b','g','p','q','r','s','t','u','v','w'],None),
        'edif_mult':(['b','l','x','y','z','a_','b_','c_','d_','e_'],None),
        'emult_add':(['l','g','f_','g_','h_','i_','j_','k_','l_','m_'],None),
}

params = _combine_param_groups(flats)
params_ = _combine_param_groups(exts)

params.update(params_)

params['all_flat_quadk']= (['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'],None)
params['all_ext_quadk'] = (['b','l','g','p','q','r','s','t','u','v','w','x','y','z','a_','b_','c_','d_','e_','f_','g_','h_','i_','j_','k_','l_','m_','t_','u_','v_','w_','x_','y_'],None)

# Normalised variants of the first five ext combinations (the sixth and last
# one is not included, as in the original cell).
for key in list(params_.keys())[:5]:
    params[f'{key}_normed_add']=(sorted(list(set(params_[key][0]+['n_','o_','p_','q_','r_','s_']))),testUtils.dict_combine(params_[key][1],None))
    params[f'{key}_normed_mult']=(sorted(list(set(params_[key][0]+['t_','u_','v_','w_','x_','y_']))),testUtils.dict_combine(params_[key][1],None))

params['norm_only_add']=(['n_','o_','p_','q_','r_','s_'],None)
params['norm_only_mult']=(['t_','u_','v_','w_','x_','y_'],None)


# --- 'quad' specifications: three parameters per flat group -------------------
# flats / exts are rebound on purpose; the quadk versions are no longer needed.
flats = {
          'fdif_quad':(['a','b','c'],None),
          'fadd_quad':(['f','g','h'],None),
          'fmult_quad':(['k','l','m'],None),
}

exts = {'edif_add_quad':(['b','g','p','q','r','s','t','u'],None),
        'edif_mult_quad':(['b','l','x','y','z','a_','b_','c_'],None),
        'emult_add_quad':(['l','g','f_','g_','h_','i_','j_','k_'],None),
}

params2 = _combine_param_groups(flats)
params2_ = _combine_param_groups(exts)

params2['all_flat_quad']= (['a','b','c','f','g','h','k','l','m'],None)
params2['all_ext_quad'] = (['b','l','g','p','q','r','s','t','u','x','y','z','a_','b_','c_','f_','g_','h_','i_','j_','k_'],None)


for key in list(params2_.keys())[:5]:
    params2[f'{key}_normed_add']=(sorted(list(set(params2_[key][0]+['n_','o_','p_','s_']))),testUtils.dict_combine(params2_[key][1],None))
    params2[f'{key}_normed_mult']=(sorted(list(set(params2_[key][0]+['t_','u_','v_','w_','x_','y_']))),testUtils.dict_combine(params2_[key][1],None))

params2.update(params2_)
# The 'quad' specifications are built but deliberately not merged into params:
#params.update(params2)

# Add a variant with the extra parameter 'z_' for the first ten specifications.
for key in list(params.keys())[:10]:
    params[f'{key}_sigtune']=(params[key][0]+['z_'],None)

# Re-import the local modules, in this order, so edits made on disk are picked up.
for local_module in (func_ob, TunaSims, testUtils):
    reload(local_module)
#helper funcs
# These are plain ``def``s rather than lambdas bound to names: they get a real
# __name__ in tracebacks, and pickle can serialise them by reference.
# NOTE(review): the fitted objects are pickled later in this notebook; whether
# they hold references to these callables is not visible here.
def squared_loss(x):
    """Squared distance of ``x`` from 1."""
    return (1-x)**2


def lin_loss(x):
    """Absolute distance of ``x`` from 1."""
    return np.abs(1-x)


def l1_reg(l, x):
    """``l`` times the sum of absolute values of ``x``."""
    return l*np.sum(np.abs(x))


def l2_reg(l, x):
    """``l`` times the Euclidean norm of ``x`` (``x`` must support ``x**2``)."""
    return l*np.sqrt(np.sum(x**2))


def no_reg(x):
    """No regularisation: always 0, whatever ``x`` is."""
    return 0


# Grid of settings handed to testUtils.create_all_funcs_stoch below.
# reg_names[i] labels reg_funcs[i]; loss_names[i] labels losses[i].
reg_funcs = [no_reg,partial(l2_reg,0.01),partial(l2_reg,0.1)]
reg_names = ['none_none','l2_0.01','l2_0.1']
losses = [squared_loss]
loss_names = ['squared']
momentums = ['none']
mom_weights = [[0.2,0.8]]
lambdas = [0.01]
max_iters = [1e4]

# The same-CE and dif-CE regimes use an identical grid of specifications, so the
# keyword arguments are assembled once and passed to both calls.
stoch_grid = dict(
    reg_funcs=reg_funcs,
    reg_names=reg_names,
    losses=losses,
    loss_names=loss_names,
    momentums=momentums,
    params=params,
    inits=inits,
    mom_weights=mom_weights,
    lambdas=lambdas,
    max_iters=max_iters,
    func=TunaSims.tuna_combo_distance,
)

# Two separate calls on purpose: each regime gets its own list of objects.
funcs_same = testUtils.create_all_funcs_stoch(**stoch_grid)
funcs_dif = testUtils.create_all_funcs_stoch(**stoch_grid)

# index 0 -> same-CE functions, index 1 -> dif-CE functions
all_funcs_ = [funcs_same, funcs_dif]

print(f'number of specifications: {len(funcs_same)}')


In [None]:
reload(TunaSims)

# Fit every specification on every (CE regime, query/target column pair) of the
# training matches, one ppm window at a time, and pickle the fitted objects.
dataset_names = ['same_ce','dif_ce']

for window in ppm_windows:

    trained_dict = dict()

    sub_train_same_ce = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_same_ce_{window}_ppm.pkl')
    sub_train_dif_ce = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/train_cleaned_matches_dif_ce_{window}_ppm.pkl')

    train_datasets = [sub_train_same_ce, sub_train_dif_ce]

    for dataset_idx, (dataset_name, dataset) in enumerate(zip(dataset_names, train_datasets)):

        # columns come in (query, target) pairs; the floor division ignores
        # an odd leftover column
        for j in range(dataset.shape[1] // 2):

            # .copy() so the rename / new column below act on an independent
            # frame rather than on a slice of the big table
            sub = dataset.iloc[:,2*j:2*(j+1)].copy()
            sub.columns=['query','target']
            sub['match'] = dataset['match']

            # Fresh, unfitted copies for this column pair. Refitting the same
            # objects for every j would make all trained_dict entries of a
            # regime point at the same objects, reflecting only the last fit.
            trained = copy.deepcopy(all_funcs_[dataset_idx])
            for i, model in enumerate(trained):

                model.fit(sub)
                if (i+1)%10==0:
                    print(f'trained {i+1} functions on {dataset_name}_{j}')

            trained_dict[f'{dataset_name}_{j}'] = trained

    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_dict_{window}_ppm.pkl', 'wb') as handle:
        pickle.dump(trained_dict, handle)

    # free the fitted objects before the next window; they are on disk now
    del trained_dict
    

    

Get AUCs

In [None]:
def _score_split(split, window, trained_dict, dataset_names):
    """Score the fitted models on one split ('train', 'val' or 'test').

    For each name in ``dataset_names`` the matching pickle under
    ``intermediateOutputs/splitMatches`` is read. For each (query, target)
    column pair ``j`` the models stored under ``'<name>_<j>'`` in
    ``trained_dict`` are passed to ``testUtils.trained_res_to_df`` together
    with a three-column frame (query, target, match).

    Returns a list of ``(settings_label, result)`` tuples in iteration order,
    where ``result`` is whatever ``testUtils.trained_res_to_df`` returned.
    """
    scored = []
    for dataset_name in dataset_names:
        matches = pd.read_pickle(f'{outputs_path}/intermediateOutputs/splitMatches/{split}_cleaned_matches_{dataset_name}_{window}_ppm.pkl')
        for j in range(matches.shape[1] // 2):

            #grab trained models for this portion of dataframe
            models = trained_dict[f'{dataset_name}_{j}']
            # .copy() so the edits below do not act on a slice of ``matches``
            sub = matches.iloc[:,2*j:2*(j+1)].copy()
            sub.columns=['query','target']
            sub['match'] = matches['match'].tolist()

            scored.append((f'{dataset_name}_{j}', testUtils.trained_res_to_df(models,sub)))
            print(f'completed {dataset_name}_{j}')
        del matches
    return scored


dataset_names = ['same_ce','dif_ce']

for window in ppm_windows:

    # The training cell deletes trained_dict after writing each window, so the
    # fitted models for this window have to be read back from disk.
    # (pickle.load executes code from the file: only load files this notebook wrote.)
    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_dict_{window}_ppm.pkl', 'rb') as handle:
        trained_dict = pickle.load(handle)

    # train: keep the full result frames, labelled by settings
    train_frames = []
    for settings, res in _score_split('train', window, trained_dict, dataset_names):
        res.insert(1,'settings', settings)
        train_frames.append(res)
    trained_res = pd.concat(train_frames)
    print('generated train results')

    # val / test: only the AUC column is kept, appended as a new column.
    # This relies on every split yielding rows in the same order and number as train.
    for split in ('val', 'test'):
        split_aucs = []
        for settings, res in _score_split(split, window, trained_dict, dataset_names):
            split_aucs.extend(res['auc'].tolist())
        trained_res[split] = split_aucs
        print(f'generated {split} results')

    with open(f'{outputs_path}/intermediateOutputs/train_to_error/trained_res_{window}_ppm.pkl', 'wb') as handle:
        pickle.dump(trained_res, handle)

    # release this window's objects before loading the next one
    del trained_res
    del trained_dict

    

    



Conclusions: 

add offsets for terms

the number of params does not appear to change train time much

consider replacing knockouts with sigmoids

consider tuning final sigmoid

should features like length and entropy be included in the similarity, or be used outside it as extra features in the learned model? both? neither?


Other Ideas:

Accuracy (In order of increasing difficulty):

-Incorporate as a feature how many possible chem structures (can also restrict to NPS) exist within a certain precursor distance (violating golden rules or not)

-include original NIST version or theoretical res as feature

-Weight different ranges of spec differently for matches (more diversity/greater accuracy)

-smush together top n results over different inchicores and come up with combined model predicting over individual inchicores

-diagnostic ion/loss classing as a feature...do they match

-kernelized smooth match

-3d struct guesses...do they match (cores, but can generalize to 3d)

Speed(In order of increasing difficulty):

-combine sim metrics and expand(apply func to df)

-exclude matches based on non-similarity features to cut down on needed comparisons

-ion tables to upper bound similarity

-only use one peak consolidation and matching protocol...then only do reweight transformations on already matched peaks for spec and sim features

-can missing peaks in lower energy be explained by frags and losses from higher energy? incorporate into model

Order to proceed:

-recreate databases with coll energy included (standardized format across DBs)

-what proportion of matches are the same coll energy?

-quantify variability in peak appearance vs peak intensity across collision energies
    -does this relate in a predictable way to fragment mass

-test sim metrics for same coll energy vs not same coll energy (is the same inductive bias useful?)

-Show that regular funcs are in the space of combo distance

-test combining individual metrics that use different components of the 2 vectors (add, mult, dif)

-range over individual metrics in combined score in attempt to explain why combining them is successful

-train combo metrics with flattened components and individual (should these sims be broken out?)
    -should we do this for same coll energy vs dif energies

-are different combo metrics put into larger model more successful than the combined individual metrics

-can tunasims be fit with nonlinearities between the components (flattened or not?)