In [None]:
import numpy as np
import pandas as pd
from functools import partial
from importlib import reload
from collections import Counter
import copy
import matplotlib.pyplot as plt

from TunaSims import tuna_sim
import funcOb
import math_distance
import tools

import pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook
# (later cells assign new columns to frames obtained by slicing).
pd.options.mode.chained_assignment = None  # default='warn'

Similarity Demo: Manhattan, Dot Product, and Harmonic Mean

In [None]:
# Load two chunks of pre-matched spectra. Both files come from the *train*
# split here; only the first row of `demo_matches` is used by the demo below.
demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/train/10_ppm/chunk_1.pkl')
demo_matches_test = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/train/10_ppm/chunk_2.pkl')

# Pull the example query/target pair (and their precursor values) from row 0.
first_match = demo_matches.iloc[0]
demo_query, demo_target = first_match['query'], first_match['target']
demo_query_prec, demo_target_prec = first_match['precquery'], first_match['prectarget']

In [None]:
# Positional inputs shared by every tuna_sim call in this cell.
demo_inputs = (demo_query, demo_target, demo_query_prec, demo_target_prec)

# tuna_sim under three parameterisations; judging by the variable names they are
# meant to mimic the Manhattan, dot-product and harmonic-mean scores computed below.
manhattan_tuna = tuna_sim(*demo_inputs,
                          dif_a=1,
                          dif_b=1,
                          unnormed=1)

dot_tuna = tuna_sim(*demo_inputs,
                    mult_a=1,
                    mult_b=2,
                    collapsed=1,
                    mult_norm_a=1,
                    mult_norm_b=2,
                    sim_flip=True)

harmonic_tuna = tuna_sim(*demo_inputs,
                         mult_a=1,
                         mult_b=1,
                         expanded=2,
                         add_norm_a=1,
                         add_norm_b=1,
                         sim_flip=True)

# Reference values: scale the intensity column (index 1) of each spectrum to sum
# to one. This happens in place, after the tuna_sim calls above have already run.
for spectrum in (demo_query, demo_target):
    spectrum[:, 1] /= sum(spectrum[:, 1])

combined_old = tools.match_peaks_in_spectra(demo_query, demo_target, ms2_da=0.05)
query_intensity = combined_old[:, 1]
target_intensity = combined_old[:, 2]

manhattan = 1 - tools.sigmoid(math_distance.manhattan_distance(query_intensity, target_intensity))
dot_product = tools.sigmoid(1 - math_distance.dot_product_nosqrt_distance(query_intensity, target_intensity))
harmonic_mean = tools.sigmoid(1 - math_distance.harmonic_mean_distance(query_intensity, target_intensity))

# Absolute gap between each reference value and its tuna_sim counterpart.
for label, reference, tuna_value in (('manhattan', manhattan, manhattan_tuna),
                                     ('dot_product', dot_product, dot_tuna),
                                     ('harmonic_mean', harmonic_mean, harmonic_tuna)):
    print(f'{label}: {abs(reference - tuna_value)}')

Can we recover a similarity function from scores and input vectors alone, and which training strategies work best?

In [None]:
def _score_rows(sim_func, matches):
    """Score every row of ``matches`` with ``sim_func``.

    ``sim_func`` is called as ``sim_func(query, target, precquery, prectarget)``
    with the values of the identically named columns. Returns a float array
    holding one score per row, in row order.
    """
    estimates = np.zeros(len(matches))
    rows = zip(matches['query'].to_numpy(),
               matches['target'].to_numpy(),
               matches['precquery'].to_numpy(),
               matches['prectarget'].to_numpy())
    for i, (query, target, prec_query, prec_target) in enumerate(rows):
        estimates[i] = sim_func(query, target, prec_query, prec_target)
    return estimates


def func_err_tester(base_objects, test_params, datasets , logpath=None):
    """Fit every base object on every parameter set and dataset; tabulate the errors.

    base_objects: iterable of func_ob-like objects. Each needs a ``name`` and the
        ``fit(train)`` / ``trained_func()`` methods; it is deep-copied before
        training, so the objects passed in are never modified.
    test_params: dict mapping a label to the list of parameter names to fit.
    datasets: dict mapping a label to a ``(train, test)`` tuple of DataFrames with
        the columns 'query', 'target', 'precquery', 'prectarget' and 'match'.
    logpath: optional file path; when given, each result row is appended to it
        as soon as it has been computed.

    Returns a DataFrame with one row per (object, parameter set, dataset):
        name           - ``name`` of the base object
        params         - label of the parameter set
        trained_values - ``init_vals`` of the trained copy, read after ``fit``
        metric         - label of the dataset
        train_err      - mean absolute error of the fitted function on train
        test_err       - mean absolute error of the fitted function on test
        range_control  - mean absolute difference between the test estimates and
                         the train targets paired by position; when the two frames
                         differ in length only the common leading rows are used
    """

    results = list()
    for base_object in base_objects:

        for name, params in test_params.items():

            for dataset_name, (train, test) in datasets.items():

                # don't train the original
                train_func = copy.deepcopy(base_object)

                # initialize proper values and train
                train_func.params = params
                train_func.init_vals = np.zeros(len(params)) + 0.5
                train_func.fit(train)

                fitted_func = train_func.trained_func()

                train_estimates = _score_rows(fitted_func, train)
                test_estimates = _score_rows(fitted_func, test)

                train_targets = train['match'].to_numpy()
                test_targets = test['match'].to_numpy()

                # The control pairs test estimates with train targets by position;
                # truncate to the shared length so unequal frame sizes don't raise.
                n_control = min(len(test_estimates), len(train_targets))

                # Build the row once so the table and the log can never disagree.
                row = [base_object.name,
                       name,
                       train_func.init_vals,
                       dataset_name,
                       np.mean(abs(train_estimates - train_targets)),
                       np.mean(abs(test_estimates - test_targets)),
                       np.mean(abs(test_estimates[:n_control] - train_targets[:n_control]))]
                results.append(row)

                if logpath is not None:
                    with open(logpath, 'a') as handle:
                        handle.write(f'{row} \n')

    return pd.DataFrame(results, columns = ['name', 'params', 'trained_values', 'metric', 'train_err', 'test_err', 'range_control'])

                    


In [None]:
# Template object for the experiment. func_err_tester deep-copies each skeleton
# and overwrites `params` and `init_vals` before fitting, so the [1, 1] values
# given here are only placeholders.
# NOTE(review): the name says "1k" but max_iter is 2000 - confirm which is intended.
base_skeleton = funcOb.func_ob(
    name="base_1k_iter",
    sim_func=partial(tuna_sim),
    init_vals=[1, 1],
    params=[1, 1],
    tol=0,
    lambdas=1,
    max_iter=2000,
    epsilon=1e-5,
)

# Disabled alternative with a larger iteration budget:
# funcOb.func_ob(
# name = "base_10k_iter",
# sim_func = partial(tuna_sim),
# init_vals= [1,1],
# params = [1,1],
# tol = 0,
# lambdas= 1,
# max_iter = 10000,
# epsilon = 1e-5),

func_skeletons = [base_skeleton]

# Parameter sets to fit: label -> names of the tuna_sim keyword arguments that
# are left free during training. Commented entries are currently disabled.
params = {
    "dif_only": ['unnormed', 'dif_a', 'dif_b', 'sim_flip'],
    # "dif_and_mult": ['unnormed','dif_a','dif_b','mult_a','mult_b', 'sim_flip'],
    "collapsed": ['collapsed', 'dif_a', 'dif_b', 'mult_a', 'mult_b', 'mult_norm_a', 'mult_norm_b', 'sim_flip'],
    "expanded": ['expanded', 'dif_a', 'dif_b', 'mult_a', 'mult_b', 'add_norm_a', 'add_norm_b', 'sim_flip'],
    # "collapsed_and_unnorm": ['unnormed', 'collapsed','dif_a','dif_b','mult_a','mult_b','mult_norm_a','mult_norm_b', 'add_norm_a', 'add_norm_b', 'sim_flip'],
    # "expanded_and_unnorm": ['unnormed', 'expanded','dif_a','dif_b','mult_a','mult_b', 'add_norm_a', 'add_norm_b', 'sim_flip'],
}

# Extra parameter names appended to every set to form its "<label>_cleaning" variant.
cleaning_params = [
    'query_max_mz_fix',
    'target_max_mz_fix',
    'query_fixed_noise',
    'target_fixed_noise',
    'query_da_thresh',
    'target_da_thresh',
    'query_fixed_power',
    'target_fixed_power',
]

new_dict = {
    f'{key}_cleaning': value + cleaning_params
    for key, value in params.items()
}

# The cleaning variants are built but not enabled; uncomment to include them.
# params.update(new_dict)

In [None]:
# Build the train/test datasets for the recovery experiment. Every dataset is a
# (train, test) tuple of frames whose 'match' column holds
# 1 - sigmoid(distance) for one reference distance, computed either on the
# spectra as stored or after the cleaning/weighting step.

# NOTE: pd.read_pickle can execute arbitrary code - only point these at trusted files.
TRAIN_CHUNK_PATH = '/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/train/10_ppm/chunk_1.pkl'
TEST_CHUNK_PATH = '/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/test/10_ppm/chunk_1.pkl'

N_MATCHES = 10000   # rows kept from each shuffled chunk
SAMPLE_SEED = 0     # fixed so Restart & Run All reproduces the same subsample
DATASET_COLUMNS = ['query', 'target', 'precquery', 'prectarget', 'match']


def _normalised_copy(spectrum):
    """Return a copy of ``spectrum`` whose intensity column (index 1) sums to one."""
    normalised = np.array(spectrum, copy=True)
    normalised[:, 1] /= sum(normalised[:, 1])
    return normalised


def _prepare_spectrum(spectrum, precursor_mz, clean):
    """Return the spectrum that is handed to the peak matcher.

    Always works on a copy so the array stored in the DataFrame is left
    untouched. With ``clean`` set, the copy first goes through
    ``tools.tuna_clean_spectrum`` (max_mz = precursor - 1.6) and
    ``tools.tuna_weight_intensity`` (fixed_exp = 0.75). The intensity column is
    then scaled to sum to one.
    """
    prepared = np.array(spectrum, copy=True)
    if clean:
        prepared = tools.tuna_clean_spectrum(prepared,
                                             max_mz=precursor_mz - 1.6,
                                             ms2_da=0.05,
                                             noise_removal_fixed=0.01,
                                             noise_removal_var=0)
        prepared[:, 1] = tools.tuna_weight_intensity(prepared, fixed_exp=0.75)
    prepared[:, 1] /= sum(prepared[:, 1])
    return prepared


def _reference_scores(matches, distance_func, clean):
    """Compute 1 - sigmoid(distance_func(...)) for every row of ``matches``.

    Rows whose score comes out as NaN get 0 instead.
    """
    scores = np.zeros(len(matches))
    rows = zip(matches['query'].to_numpy(),
               matches['target'].to_numpy(),
               matches['precquery'].to_numpy(),
               matches['prectarget'].to_numpy())
    for i, (query, target, prec_query, prec_target) in enumerate(rows):
        q = _prepare_spectrum(query, prec_query, clean)
        t = _prepare_spectrum(target, prec_target, clean)
        combined = tools.match_peaks_in_spectra(q, t, ms2_da=0.05)
        scores[i] = 1 - tools.sigmoid(distance_func(combined[:, 1], combined[:, 2]))

    # Replace NaNs on the array itself; `frame['match'].fillna(0, inplace=True)`
    # is a chained in-place call and does not update the frame under Copy-on-Write.
    scores[np.isnan(scores)] = 0
    return scores


datasets = dict()

demo_matches = pd.read_pickle(TRAIN_CHUNK_PATH)
demo_matches_test = pd.read_pickle(TEST_CHUNK_PATH)

demo_matches = demo_matches.sample(frac=1, random_state=SAMPLE_SEED)[:N_MATCHES].copy()
demo_matches_test = demo_matches_test.sample(frac=1, random_state=SAMPLE_SEED)[:N_MATCHES].copy()

# The spectra kept in the frames (and therefore in every dataset) are
# sum-normalised. This used to happen implicitly, as an in-place side effect of
# the first scoring loop; it is now one explicit step on fresh copies.
for frame in (demo_matches, demo_matches_test):
    for column in ('query', 'target'):
        frame[column] = frame[column].map(_normalised_copy)

# dataset label -> (reference distance, apply cleaning/weighting first?)
dataset_specs = {
    'manhattan': (math_distance.manhattan_distance, False),
    'manhattan_clean': (math_distance.manhattan_distance, True),
    'dot_product': (math_distance.dot_product_distance, False),
    'dotprod_clean': (math_distance.dot_product_distance, True),
}

for dataset_name, (distance_func, clean) in dataset_specs.items():
    demo_matches['match'] = _reference_scores(demo_matches, distance_func, clean)
    demo_matches_test['match'] = _reference_scores(demo_matches_test, distance_func, clean)

    # Column selection returns new frames, so each dataset keeps its own 'match'
    # values even though the working frames are overwritten on the next pass.
    datasets[dataset_name] = (demo_matches[DATASET_COLUMNS], demo_matches_test[DATASET_COLUMNS])





In [None]:
# Fit every skeleton on each parameter set / dataset combination; each result
# row is also appended to the log file as soon as it is available.
res = func_err_tester(
    base_objects=func_skeletons,
    test_params=params,
    datasets=datasets,
    logpath="/Users/jonahpoczobutt/projects/TunaRes/log_2.txt",
)

In [None]:
# Display the results table returned by func_err_tester.
res

Debug Zone

In [None]:
# NOTE(review): this rebinds the global `params` - a dict in the experiment
# cells above - to a list. Re-running the func_err_tester cell after this one
# would fail on `params.items()`; consider a separate name for the debug list.
params = ['unnormed', 'dif_a', 'dif_b', 'sim_flip']
init_vals = [0.5] * len(params)

# Pick up edits made to funcOb.py without restarting the kernel.
reload(funcOb)

debug_func = funcOb.func_ob(
    name="base_1k_iter",
    sim_func=partial(tuna_sim),
    init_vals=init_vals,
    params=params,
    tol=0,
    lambdas=1,
    max_iter=1000,
    epsilon=1e-5,
)

In [None]:
# Fit the debug object on the training frame (tuple index 0) of the 'manhattan' dataset.
debug_func.fit(datasets['manhattan'][0])

In [None]:
# Inspect the object's `grad` attribute (presumably the gradient from the last fit step - confirm in funcOb).
debug_func.grad

In [None]:
# Inspect `init_vals` on the object after the fit above.
debug_func.init_vals

In [None]:
# Score the first training row of the 'manhattan' dataset with the function
# produced by the debug object.
debug_scorer = debug_func.trained_func()
debug_row = datasets['manhattan'][0].iloc[0]
debug_scorer(debug_row['query'],
             debug_row['target'],
             debug_row['precquery'],
             debug_row['prectarget'])

In [None]:

# Pick up edits made to funcOb.py without restarting the kernel.
reload(funcOb)

# Rebuild the object from the current `init_vals` / `params` of `debug_func`
# and run the fit with max_iter = 1 on the same training frame.
single_step_settings = dict(
    name="base_1k_iter",
    sim_func=partial(tuna_sim),
    init_vals=debug_func.init_vals,
    params=debug_func.params,
    tol=0,
    lambdas=1,
    max_iter=1,
    epsilon=1e-5,
)
debug_func_ = funcOb.func_ob(**single_step_settings)

debug_func_.fit(datasets['manhattan'][0])