In [1]:
import TunaSims
import numpy as np
from funcTrainer import specSimTrainer
import pandas as pd
import datasetBuilder
import tools_fast
from sklearn.metrics import roc_auc_score
import scipy
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
import time
import copy
import pickle
from itertools import combinations

In [2]:

def _weight_intensity_by_entropy(x):
    WEIGHT_START = 0.25
    ENTROPY_CUTOFF = 3
    weight_slope = (1 - WEIGHT_START) / ENTROPY_CUTOFF

    if np.sum(x) > 0:
        entropy_x = scipy.stats.entropy(x)
        if entropy_x < ENTROPY_CUTOFF:
            weight = WEIGHT_START + weight_slope * entropy_x
            x = np.power(x, weight)
            x_sum = np.sum(x)
            x = x / x_sum
    return x

def ppm(base, ppm):
    """
    Convert a ppm threshold to daltons relative to a precursor exact mass.

    base: the mass the tolerance is relative to.
    ppm:  the tolerance in parts per million.
    Returns base * (ppm / 1e6).
    """
    fraction_of_base = ppm / 1e6
    return base * fraction_of_base



In [3]:
def harmonic_mean_distance(p, q):
    r"""
    Harmonic mean score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    The value returned is the complement of the textbook harmonic mean
    distance, i.e. larger means more similar:

    .. math::

        2\sum(\frac{P_{i}Q_{i}}{P_{i}+Q_{i}})
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)
    per_peak = weighted_p * weighted_q / (weighted_p + weighted_q)
    return 2 * np.sum(per_peak)

def lorentzian_distance(p, q):
    r"""
    Lorentzian score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    The value returned is one minus the Lorentzian distance:

    .. math::

        1 - \sum{\ln(1+|P_i-Q_i|)}
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)
    lorentzian = np.sum(np.log(1 + np.abs(weighted_p - weighted_q)))
    return 1 - lorentzian

def matusita_distance(p, q):
    r"""
    Matusita-style score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    The value returned is one minus the sum of squared differences of the
    square-rooted intensities (no outer square root is taken here):

    .. math::

        1 - \sum(\sqrt{P_{i}}-\sqrt{Q_{i}})^2
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)
    root_gap = np.sqrt(weighted_p) - np.sqrt(weighted_q)
    return 1- np.sum(np.power(root_gap, 2))

def probabilistic_symmetric_chi_squared_distance(p, q):
    r"""
    Probabilistic symmetric χ2 score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    The value returned is one minus the probabilistic symmetric χ2 distance:

    .. math::

        1 - \frac{1}{2} \times \sum\frac{(P_{i}-Q_{i}\ )^2}{P_{i}+Q_{i}\ }
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)
    chi_squared = np.sum(np.power(weighted_p - weighted_q, 2) / (weighted_p + weighted_q))
    return 1- (1 / 2 * chi_squared)

def entropy_distance(p, q):
    r"""
    Entropy score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    With :math:`S_X` the value of ``scipy.stats.entropy`` for X and
    :math:`PQ = P + Q`, the value returned is (no division by ln(4) is applied):

    .. math::

        1 - (2\times S_{PQ}-S_P-S_Q)
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)

    entropy_merged = scipy.stats.entropy(weighted_p + weighted_q)
    entropy_p = scipy.stats.entropy(weighted_p)
    entropy_q = scipy.stats.entropy(weighted_q)

    entropy_increase = 2 * entropy_merged - entropy_p - entropy_q
    return 1 - entropy_increase

def dot_product_distance(p, q):
    r"""
    Dot product (cosine) score on entropy-weighted intensities.

    Both inputs are first passed through ``_weight_intensity_by_entropy``.
    The value returned is the normalised dot product itself, not one minus it:

    .. math::

        \sqrt{\frac{(\sum{Q_iP_i})^2}{\sum{Q_i^2}\sum{P_i^2}}}
    """
    weighted_p = _weight_intensity_by_entropy(p)
    weighted_q = _weight_intensity_by_entropy(q)
    squared_overlap = np.power(np.sum(weighted_q * weighted_p), 2)
    squared_norms = np.sum(np.power(weighted_q, 2)) * np.sum(np.power(weighted_p, 2))
    return np.sqrt(squared_overlap / squared_norms)

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated with numpy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


Create Matches DFs

In [4]:
# Rebuild switch: the cell guarded by `if create_new_dataset:` re-filters the input chunks,
# rewrites the *_no_prec pickles and recomputes the classic similarity scores only when True.
create_new_dataset = False

            

Create old similarities

In [5]:
sims_output_dir = '/Users/jonahpoczobutt/projects/TunaRes/oldSimRes'

# Defined outside the rebuild guard: a later cell loops over sim_names to load
# the saved .npy score files, so the names must exist even when
# create_new_dataset is False.
sim_names = ['prob','matusita','entropy','dot','lorentzian','harmonic']
distances = [probabilistic_symmetric_chi_squared_distance,
             matusita_distance,
             entropy_distance,
             dot_product_distance,
             lorentzian_distance,
             harmonic_mean_distance]


def _drop_precursor_peaks(matches, ppm_tol = 3):
    """
    Return a copy of `matches` whose 'query' and 'target' arrays keep only the
    rows whose column 0 is below the corresponding precursor value
    ('precquery' / 'prectarget') minus a `ppm_tol` ppm tolerance.

    Pairs where either filtered array ends up empty are dropped.
    """
    kept_positions = list()
    queries = list()
    targets = list()

    for position in range(len(matches)):
        row = matches.iloc[position]
        query = row['query'][row['query'][:,0] < row['precquery'] - ppm(row['precquery'], ppm_tol)]
        target = row['target'][row['target'][:,0] < row['prectarget'] - ppm(row['prectarget'], ppm_tol)]

        if len(query) > 0 and len(target) > 0:
            kept_positions.append(position)
            queries.append(query)
            targets.append(target)

    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the input (avoids SettingWithCopyWarning).
    filtered = matches.iloc[kept_positions].copy()
    filtered['query'] = queries
    filtered['target'] = targets
    return filtered


def _classic_scores(matches, distance, ms2_da = 0.05):
    """
    Score every query/target pair in `matches` with `distance`.

    Each pair is aligned with tools_fast.match_spectrum; columns 1 and 2 of the
    aligned array (presumably query and target intensities — confirm against
    tools_fast) are each normalised to sum to 1, scored, and squashed with
    sigmoid.  Returns a numpy array with one score per row.
    """
    scores = list()
    for position in range(len(matches)):
        row = matches.iloc[position]
        matched = tools_fast.match_spectrum(row['query'], row['target'], ms2_da = ms2_da)
        scores.append(sigmoid(distance(matched[:,1]/sum(matched[:,1]), matched[:,2]/sum(matched[:,2]))))
    return np.array(scores)


if create_new_dataset:

    demo_matches = _drop_precursor_peaks(pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/Nist20_inputs/train/chunk_1.pkl'))
    demo_matches_val = _drop_precursor_peaks(pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/Nist20_inputs/val/chunk_1.pkl'))

    demo_matches.to_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_no_prec.pkl')
    demo_matches_val.to_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_val_no_prec.pkl')

    # One .npy file per classic similarity and per split.
    for sim_name, distance in zip(sim_names, distances):
        np.save(f'{sims_output_dir}/val_{sim_name}.npy', _classic_scores(demo_matches_val, distance))
        np.save(f'{sims_output_dir}/train_{sim_name}.npy', _classic_scores(demo_matches, distance))


In [6]:
# Three nested starting points for the trainable parameters:
#   init_vals   -> without add_norm_a / add_norm_int   (name suffix 'b')
#   init_vals_2 -> adds add_norm_a                      (name suffix 'ab')
#   init_vals_3 -> adds add_norm_a and add_norm_int     (name suffix 'abint')
init_vals = {
    'mult_a' : 0.001,
    'mult_b': 1,
    'dif_a': 0.001,
    'dif_b':1,
    'add_norm_b' : 1,
    'target_intensity_a': 0.1,
    'query_intensity_a': 0.1,
    'target_intensity_b': 0.1,
    'query_intensity_b': 0.1,
    }

init_vals_2 = {
    'mult_a' : 0.001,
    'mult_b': 1,
    'dif_a': 0.001,
    'dif_b':1,
    'add_norm_a': 1,
    'add_norm_b' : 1,
    'target_intensity_a': 0.1,
    'query_intensity_a': 0.1,
    'target_intensity_b': 0.1,
    'query_intensity_b': 0.1,
    }

init_vals_3 = {
    'mult_a' : 0.001,
    'mult_b': 1,
    'dif_a': 0.001,
    'dif_b':1,
    'add_norm_int': 0,
    'add_norm_a': 1,
    'add_norm_b' : 1,
    'target_intensity_a': 0.1,
    'query_intensity_a': 0.1,
    'target_intensity_b': 0.1,
    'query_intensity_b': 0.1,
    }

# Not passed to specSimTrainer below; kept because it is a module-level name.
regularization_grad = lambda x: 0.

fixed_vals = {'sigmoid_score' : True,
              'weight_combine': 'multiply'
    }

# NOTE: this second assignment overrides the dict above, so no values are fixed.
fixed_vals = {}

# (lower, upper) limits per parameter name.
bounds = {'add_norm_b': (0, 2),
          'mult_add_norm_b': (0, 2),
          'dif_add_norm_b': (0, 2),
          'mult_b': (1e-10, 2),
          'add_norm_a': (1e-10, 3),
          'dif_b': (1e-10, 2),
          'dif_a':(-3,3),
          'mult_a': (-3,3),
          'add_norm_int': (0, 3),
          'target_normalized_intensity_int': (-0.2,1),
          'query_normalized_intensity_int': (-0.2,1),
          'target_normalized_intensity_a': (1e-10,2),
          'query_normalized_intensity_a': (1e-10,2),
          'target_normalized_intensity_b': (0,2),
          'query_normalized_intensity_b': (0,2),
          'target_normalized_intensity_c': (-2,2),
          'query_normalized_intensity_c': (-2,2),
          'target_mz_b': (-2,2),
          'query_mz_b': (-2,2),
          'target_mz_a': (-2,2),
          'query_mz_a': (-2,2),
          'target_mz_int': (-0.2,1),
          'query_mz_int': (-0.2,1),
          'target_mz_c': (-2,2),
          'query_mz_c': (-2,2),
          'target_intensity_int': (-0.2,1),
          'query_intensity_int': (-0.2,1),
          'target_intensity_a': (1e-10,2),
          'query_intensity_a': (1e-10,2),
          'target_intensity_b': (1e-10,2),
          'query_intensity_b': (1e-10,2),
          'target_intensity_c': (1e-10,2),
          'query_intensity_c': (1e-10,2),
          }


init_names = ['b', 'ab', 'abint']
inits = [init_vals, init_vals_2, init_vals_3]
ad_params = [(0.98,0.025)]
func_obs = list()

# One trainer per (momentum, scheduler, init set, ad_param) combination.
# `momentum` is only recorded in the trainer name; it is not passed to
# specSimTrainer.  Names and init sets are paired with zip so that no loop
# index is reused by a nested loop.
for momentum in [None]:
    for sched in [None]:
        for init_name, init in zip(init_names, inits):
            for ad_param in ad_params:

                func_obs.append(specSimTrainer(f'{momentum}_{sched}_{init_name}_{ad_param}',
                            init_vals = init.copy(),
                            fixed_vals = fixed_vals,
                            bounds = bounds,
                            max_iter = 1e6,
                            learning_rates = 0.001,
                            learning_rate_scheduler = sched,
                            learning_beta = 0.5,
                            balance_column= 'score',
                            groupby_column = 'queryID_target_base',
                            ad_int = ad_param[0],
                            ad_slope= ad_param[1]))

print(len(func_obs))

3


In [7]:
demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_no_prec.pkl')
demo_matches_val = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_val_no_prec.pkl')
#demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_lite.pkl')

# Binary label (1 * InchiCoreMatch) and the grouping key used for the top-hit AUCs.
demo_matches['score'] = 1 * demo_matches['InchiCoreMatch']
demo_matches['queryID_target_base'] = demo_matches['queryID'].astype(str) + '_' + demo_matches['target_base'].astype(str)
demo_matches_val['queryID_target_base'] = demo_matches_val['queryID'].astype(str) + '_' + demo_matches_val['target_base'].astype(str)

demo_matches_val['score'] = 1 * demo_matches_val['InchiCoreMatch']

# Per trainer name: one entry per repetition, each a list over `offsets`.
train_auc_top = {i.name: list() for i in func_obs}
val_auc_top = {i.name: list() for i in func_obs}

train_auc_all = {i.name: list() for i in func_obs}
val_auc_all = {i.name: list() for i in func_obs}

train_times = {i.name: list() for i in func_obs}

# Cumulative iteration checkpoints and the increments between them.
absolutes = [0, 3e5]
offsets = [absolutes[i+1] - absolutes[i] for i in range(len(absolutes)-1)]

reps = 5

trained_obs = []

for model in func_obs:

    # Collects the trained copy from every repetition of this configuration.
    trained_obs_sub = list()

    for _ in range(reps):

        model_ = copy.deepcopy(model)

        accumulated = 0
        accumulated_time = 0
        train_aucs_top = list()
        val_aucs_top = list()
        train_aucs_all = list()
        val_aucs_all = list()

        for i in offsets:

            model_.max_iter = i

            # Random restart: draw each parameter uniformly within its bounds and
            # instantiate the similarity function from the draw.
            # NOTE(review): sim_func is called like a class and then replaced by the
            # result, so this only works for a single entry in `offsets`.
            # NOTE(review): the saved output of this cell is "TypeError: list indices
            # must be integers or slices, not str"; the failing statement is not
            # identifiable from this notebook — check how specSimTrainer stores
            # `bounds` and what the sim_func constructor expects.
            keys = model_.sim_func.grad_names
            random_inits = {keys[k]: np.random.uniform(model_.bounds[k][0], model_.bounds[k][1]) for k in range(len(keys))}
            model_.sim_func = model_.sim_func(**random_inits)

            print('start')
            start = time.time()
            model_.fit(demo_matches)
            accumulated_time += time.time() - start

            print(f'done training: {round((accumulated_time)/60, 4)}')

            demo_matches['preds'] = model_.sim_func.predict_for_dataset(demo_matches)
            demo_matches_val['preds'] = model_.sim_func.predict_for_dataset(demo_matches_val)

            # AUC over every candidate pair.
            train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4))
            val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

            # AUC over the per-group maxima (preds and score are maximised independently).
            temp = demo_matches[['queryID_target_base','preds','score']].groupby(by=['queryID_target_base']).max()
            temp_val = demo_matches_val[['queryID_target_base','preds','score']].groupby(by=['queryID_target_base']).max()

            train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4))
            val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

            accumulated += model_.max_iter

        trained_obs_sub.append(copy.deepcopy(model_))

        # Record this repetition.  These appends used to sit outside the reps loop,
        # which kept only the final repetition's results.
        train_times[model.name].append(round(accumulated_time/60, 4))
        train_auc_all[model.name].append(train_aucs_all)
        train_auc_top[model.name].append(train_aucs_top)
        val_auc_all[model.name].append(val_aucs_all)
        val_auc_top[model.name].append(val_aucs_top)

    trained_obs.append(trained_obs_sub)

    print(model.name)

    # Ends up holding the last repetition of the last configuration in func_obs.
    model_1 = model_

train_auc_top, val_auc_top

TypeError: list indices must be integers or slices, not str

In [8]:
# Inspect which parameter names the similarity function exposes as grad_names.
model_.sim_func.grad_names

['dif_a',
 'dif_b',
 'mult_a',
 'mult_b',
 'add_norm_int',
 'add_norm_a',
 'add_norm_b',
 'query_intensity_a',
 'query_intensity_b',
 'target_intensity_a',
 'target_intensity_b']

In [None]:
# Load a previously pickled object and print, for every initial-value name of model_1,
# the attribute on model_1.sim_func next to the same attribute on the loaded object.
# NOTE: pickle.load executes arbitrary code — only load files you created yourself.
with open('/Users/jonahpoczobutt/projects/TunaRes/pickled_models/model_1.pickle', 'rb') as handle:
    model = pickle.load(handle)


for i in model_1.init_vals:

    print(i, getattr(model_1.sim_func,i), getattr(model,i))

In [None]:
# Re-score the training pairs with a pickled model after switching off a set of its flags.
# NOTE: pickle.load executes arbitrary code — only load files you created yourself.
with open('/Users/jonahpoczobutt/projects/TunaRes/pickled_models/model_1.pickle', 'rb') as handle:
    model_ = pickle.load(handle)

# Set every mz / intensity related flag on the loaded object to False.
model_.query_mz = False
model_.target_mz = False
model_.query_mz_offset = False
model_.target_mz_offset = False
model_.query_intensity = False
model_.target_intensity = False
model_.query_normalized_intensity = False
model_.target_normalized_intensity = False

# One prediction per row; predict is called on the loaded object directly.
demo_matches['preds'] = [model_.predict(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], demo_matches.iloc[i]['precquery'], demo_matches.iloc[i]['prectarget'], grads = False) for i in range(len(demo_matches))]
#demo_matches_val['preds'] = [model_.predict(demo_matches_val.iloc[i]['query'], demo_matches_val.iloc[i]['target'], demo_matches_val.iloc[i]['precquery'], demo_matches_val.iloc[i]['prectarget'],grads = False) for i in range(len(demo_matches_val))]

# train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4)) 
# val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

# Keep, per (queryID, target_base) group, the first row that attains the group's maximum prediction.
temp = demo_matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])
#temp_val = demo_matches_val.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])

# NOTE(review): train_aucs_top is not created in this cell; it must already exist
# from an earlier training cell for this append to work.
train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4)) 
#val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

In [None]:
import TunaSims
import TunaSimsOld
import pandas as pd

# Reload the training pairs and build two similarity objects with matching
# parameter values so the new and old implementations can be compared.
demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_no_prec.pkl')

# New implementation: parameters are named *_intensity_*.
new = TunaSims.speedyTuna(query_intensity_a = 1,
                          query_intensity_b = 1,
                          target_intensity_a = 1,
                          target_intensity_b = 1,
                          mult_a = 0.001,
                          mult_b = 1,
                          dif_a= 0.001,
                          dif_b = 1,
                          add_norm_a= 1,
                          add_norm_b= 1)

# Old implementation: the corresponding parameters are named *_normalized_intensity_*.
old = TunaSimsOld.ExpandedTuna(query_normalized_intensity_a = 1,
                          query_normalized_intensity_b = 1,
                          target_normalized_intensity_a = 1,
                          target_normalized_intensity_b = 1,
                          mult_a = 0.001,
                          mult_b = 1,
                          dif_a= 0.001,
                          dif_b = 1,
                          add_norm_a= 1,
                          add_norm_b= 1)

In [None]:
# Run the new implementation with gradients on rows 999 and 1000 (results are discarded).
for i in range(999,1001):
    new.predict(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], grads = True)

In [None]:
# Run the new implementation without gradients on the first 10000 rows (results are discarded).
for i in range(10000):
    new.predict(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], grads = False)

In [None]:
# Run the old implementation with gradients on rows 999 and 1000; unlike the new one,
# its predict also takes the two precursor values (results are discarded).
for i in range(999,1001):
    old.predict(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], demo_matches.iloc[i]['precquery'], demo_matches.iloc[i]['prectarget'], grads = True)

In [None]:
# Run the old implementation without gradients on the first 10000 rows (results are discarded).
for i in range(10000):
    old.predict(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], demo_matches.iloc[i]['precquery'], demo_matches.iloc[i]['prectarget'], grads = False)

In [None]:
# Scratch frame with a single column 'yoop': 1000 ones, then 100 fives, then 50000 threes.
testy = pd.DataFrame([1 for i in range(1000)] + [5 for i in range(100)] + [3 for i in range(50000)], columns = ['yoop'])

In [None]:
# Sort the scratch frame by 'yoop' in place (mutates testy).
testy.sort_values(by='yoop', inplace = True)

In [None]:
# Scratch loop: call np.sum on a 1000-element Python list 10000 times; the result is discarded.
a = [i for i in range(1000)]
for i in range(10000):
    
    np.sum(a)

In [None]:
# Print the current value held by sim_func for every initial-value name of model_.
for i in model_.init_vals:

    print(i, getattr(model_.sim_func, i))

In [None]:
# Replace model_ with the pickled object.
# NOTE: pickle.load executes arbitrary code — only load files you created yourself.
with open('/Users/jonahpoczobutt/projects/TunaRes/pickled_models/model_1.pickle', 'rb') as handle:

    model_ = pickle.load(handle)

In [None]:
# Print attributes by the initial-value names of model_.
# NOTE(review): the names are taken from model_ but the values are read from `model`
# (no trailing underscore) — confirm that mixing the two objects is intended.
for i in model_.init_vals:

    print(i, getattr(model, i))

In [None]:
# Inspect the nonzero_indices attribute of the similarity function.
model_.sim_func.nonzero_indices

In [None]:
# Inspect the n_iter attribute of the trainer.
model_.n_iter

In [None]:
# Inspect the target attribute of the similarity function.
model_.sim_func.target

In [None]:
# For every key of sim_func.grads1, print the key and the attribute of the same name on sim_func.
for i in model_.sim_func.grads1.keys():
    print(i, getattr(model_.sim_func,i))

In [None]:
import TunaSims

# Build a ScoreByQuery object with hand-picked values for its six parameters.
mer = TunaSims.ScoreByQuery(raw_scores_int = 0,
                            raw_scores_a = 1,
                            raw_scores_b = 1,
                            dif_from_top_int = 0,
                            dif_from_top_a = -1,
                            dif_from_top_b = 1)

In [None]:
import funcOb
import TunaSims

# Initial values for a scoreByQueryFunc trainer (same six names as used for ScoreByQuery).
# NOTE: this rebinds the module-level names `inits`, `fixed_vals` and `a`.
inits = {'raw_scores_int' :0,
        'raw_scores_a' : 1,
        'raw_scores_b' : 1,
        'dif_from_top_int' : 0,
        'dif_from_top_a' : -1,
        'dif_from_top_b' : 1}

fixed_vals = {}

a = funcOb.scoreByQueryFunc(name = 'testy',
                            init_vals = inits,
                            fixed_vals = fixed_vals)

In [None]:
import numpy as np
# Scratch check: np.dot on two scalars is plain multiplication.
np.dot(1,2)

In [None]:
import numpy as np
# Scratch check: np.log(0) evaluates to -inf (numpy emits a divide-by-zero RuntimeWarning).
np.log(0)

In [None]:
# Scratch check: elementwise comparison with None on an object array, cast to a 0/1 mask.
a = np.array([None, 1])
(a == None).astype(int)

In [None]:
# Scratch check: inner product of two length-2 lists (1*1 + 2*2).
np.dot([1,2], [1,2])

In [None]:
# Try the ScoreByQuery object on three candidate scores with gradients requested.
mer.predict(scores = [0.9, 0.8, 0.5], match_names = ['a', 'b', 'c'], grads = True)

Need to add the object attributes back so that we can properly adjust gradients, for both.

In [None]:
import numpy as np
# Scratch check: column-wise sum of two equal-length rows.
np.sum([[1,2,3], [1,2,3]], axis = 0)

In [None]:
# Scratch: indices that sort `a` in descending order, and a mask marking where
# original index 0 lands in that ordering.
a = np.array([0.6,1,0,1])
sort_order = np.argsort(-a)
mask = (sort_order == 0)





In [None]:
from math import prod

# Scratch check: elementwise sum of two arrays via np.sum over axis 0.
np.sum([np.array([1,2,3]),np.array([4,5,6])], axis = 0)

In [None]:
# Show `a` reordered by sort_order (both come from an earlier scratch cell).
a[sort_order]

In [None]:
import math
# Scratch check: stack the sorted values, the mask and the original values as rows,
# then multiply down each column.
np.prod(np.vstack((a[sort_order], mask,a)),axis = 0)

In [None]:
# Scratch check: boolean-mask selection of the entries of `a` equal to 2.
a[a==2]

In [None]:
import matplotlib.pyplot as plt
# Histograms (100 bins) of the top-hit predictions held in temp (train) and temp_val (validation).
plt.hist(temp['preds'], bins = 100)
plt.title('Preds Train')
plt.show()

plt.hist(temp_val['preds'], bins = 100)
plt.title('Preds Val')
plt.show()

In [None]:
# Scratch label vector with a single positive at index 1.
trues = np.array([0,1,0,0,0,0])


#np.sum(np.trues - preds)

In [None]:
# Scratch exploration of score differences relative to the max / min prediction.
# NOTE(review): `preds` is assigned in the cell that follows this one, so this cell
# only runs after that cell has been executed.
print('function of remaining scores')
print(np.concatenate((preds,[0])))
print(np.max(preds) - np.concatenate((preds,[0])))

print('then move through in reverse for scores above')
print(np.concatenate((preds[::-1],[0])))
print(np.concatenate((preds[::-1],[0])) - np.min(preds))

In [None]:
# Scratch: prepend (1 - top prediction) to the predictions, then compute each entry's
# gap to the maximum and the running sum of all entries before it.
preds = np.array([0.45,0.4,0.3])
padded = np.concatenate(([1-preds[0]], preds))
max_dif = (np.max(padded) - padded)
# prob_above[i] is the sum of padded[:i], so the first entry is 0.
prob_above = np.array([sum(padded[:i]) for i in range(len(padded))])
print(padded)
print(max_dif)
print(prob_above)


#should also have function of other scores summed after a non-linear transformation
#array[not]


In [None]:
# Scratch check: an array built from a string and None.
np.array(['a',None])

In [None]:
def get_none_prob(max_prob):
    """ 
    Placeholder: the body contains no logic yet, so calling this returns None.

    Original note: probs must already be sorted from max to min and have
    candidate names in corresponding order.

    NOTE(review): the only parameter is ``max_prob``; the sorted probabilities and
    candidate names mentioned above are not parameters — confirm the intended signature.
    """

    


Grab only inchicores where performance was bad

Round 2

In [None]:
# Round 2: keep only the groups whose top-hit residual |score - preds| is at or
# above the median, then retrain every configuration on that subset.
temp['residual'] = np.abs(temp['score'] - temp['preds'])
median_residual = np.median(temp['residual'])
print(len(temp[temp['score'] == 1])/len(temp))

# Vectorised selection of the poorly predicted groups (replaces a row-by-row .iloc scan).
bad_rows = temp[temp['residual'] >= median_residual]
pos = int((bad_rows['score'] == 1).sum())
neg = len(bad_rows) - pos
bad_ids = set(bad_rows['queryID_target_base'])

# .copy() so the 'preds' column written below goes to an independent frame.
# NOTE: demo_matches is narrowed in place, so re-running this cell narrows it again.
demo_matches = demo_matches[demo_matches['queryID_target_base'].isin(bad_ids)].copy()
print(len(demo_matches))
print(len(bad_ids))
print(pos / (pos + neg))

# Per trainer name: one entry per repetition, each a list over `offsets`.
train_auc_top = {i.name: list() for i in func_obs}
val_auc_top = {i.name: list() for i in func_obs}

train_auc_all = {i.name: list() for i in func_obs}
val_auc_all = {i.name: list() for i in func_obs}

train_times = {i.name: list() for i in func_obs}

absolutes = [0,1e5]
offsets = [absolutes[i+1] - absolutes[i] for i in range(len(absolutes)-1)]

reps = 1

trained_obs = []

for model in func_obs:

    # Collects the trained copy from every repetition of this configuration.
    trained_obs_sub = list()

    for _ in range(reps):

        model_ = copy.deepcopy(model)

        accumulated = 0
        accumulated_time = 0
        train_aucs_top = list()
        val_aucs_top = list()
        train_aucs_all = list()
        val_aucs_all = list()

        for i in offsets:

            model_.max_iter = i

            start = time.time()
            model_.fit(demo_matches)
            accumulated_time += time.time() - start

            # One prediction per row; each row is materialised once instead of four times.
            demo_matches['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches.iterrows()]
            demo_matches_val['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches_val.iterrows()]

            train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4))
            val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

            # First row attaining the maximum prediction within each (queryID, target_base) group.
            temp = demo_matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])
            temp_val = demo_matches_val.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])

            train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4))
            val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

            accumulated += model_.max_iter

        trained_obs_sub.append(copy.deepcopy(model_))

        # Record this repetition inside the reps loop so no repetition is lost when reps > 1.
        train_times[model.name].append(round(accumulated_time/60, 4))
        train_auc_all[model.name].append(train_aucs_all)
        train_auc_top[model.name].append(train_aucs_top)
        val_auc_all[model.name].append(val_aucs_all)
        val_auc_top[model.name].append(val_aucs_top)

    trained_obs.append(trained_obs_sub)

    print(model.name)

    # Ends up holding the last repetition of the last configuration in func_obs.
    model_2 = model_

train_auc_top, val_auc_top

Round 3

In [None]:
# Round 3: keep only the groups whose top-hit residual |score - preds| is at or
# above the median, then retrain every configuration on that subset.
temp['residual'] = np.abs(temp['score'] - temp['preds'])
median_residual = np.median(temp['residual'])
print(len(temp[temp['score'] == 1])/len(temp))
print(median_residual)

# Vectorised selection of the poorly predicted groups (replaces a row-by-row .iloc scan).
bad_rows = temp[temp['residual'] >= median_residual]
pos = int((bad_rows['score'] == 1).sum())
neg = len(bad_rows) - pos
bad_ids = set(bad_rows['queryID_target_base'])

# .copy() so the 'preds' column written below goes to an independent frame.
# NOTE: demo_matches is narrowed in place, so re-running this cell narrows it again.
demo_matches = demo_matches[demo_matches['queryID_target_base'].isin(bad_ids)].copy()
print(len(demo_matches))
print(len(bad_ids))
print(pos / (pos + neg))

# Per trainer name: one entry per repetition, each a list over `offsets`.
train_auc_top = {i.name: list() for i in func_obs}
val_auc_top = {i.name: list() for i in func_obs}

train_auc_all = {i.name: list() for i in func_obs}
val_auc_all = {i.name: list() for i in func_obs}

train_times = {i.name: list() for i in func_obs}

absolutes = [0,1e5]
offsets = [absolutes[i+1] - absolutes[i] for i in range(len(absolutes)-1)]

reps = 1

trained_obs = []

for model in func_obs:

    # Collects the trained copy from every repetition of this configuration.
    trained_obs_sub = list()

    for _ in range(reps):

        model_ = copy.deepcopy(model)

        accumulated = 0
        accumulated_time = 0
        train_aucs_top = list()
        val_aucs_top = list()
        train_aucs_all = list()
        val_aucs_all = list()

        for i in offsets:

            model_.max_iter = i

            start = time.time()
            model_.fit(demo_matches)
            accumulated_time += time.time() - start

            # One prediction per row; each row is materialised once instead of four times.
            demo_matches['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches.iterrows()]
            demo_matches_val['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches_val.iterrows()]

            train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4))
            val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

            # First row attaining the maximum prediction within each (queryID, target_base) group.
            temp = demo_matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])
            temp_val = demo_matches_val.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])

            train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4))
            val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

            accumulated += model_.max_iter

        trained_obs_sub.append(copy.deepcopy(model_))

        # Record this repetition inside the reps loop so no repetition is lost when reps > 1.
        train_times[model.name].append(round(accumulated_time/60, 4))
        train_auc_all[model.name].append(train_aucs_all)
        train_auc_top[model.name].append(train_aucs_top)
        val_auc_all[model.name].append(val_aucs_all)
        val_auc_top[model.name].append(val_aucs_top)

    trained_obs.append(trained_obs_sub)

    print(model.name)

    # Ends up holding the last repetition of the last configuration in func_obs.
    model_3 = model_

train_auc_top, val_auc_top

In [None]:
# Round 4: keep only the groups whose top-hit residual |score - preds| is at or
# above the median, then retrain every configuration on that subset.
temp['residual'] = np.abs(temp['score'] - temp['preds'])
median_residual = np.median(temp['residual'])
print(len(temp[temp['score'] == 1])/len(temp))
print(median_residual)

# Vectorised selection of the poorly predicted groups (replaces a row-by-row .iloc scan).
bad_rows = temp[temp['residual'] >= median_residual]
pos = int((bad_rows['score'] == 1).sum())
neg = len(bad_rows) - pos
bad_ids = set(bad_rows['queryID_target_base'])

# .copy() so the 'preds' column written below goes to an independent frame.
# NOTE: demo_matches is narrowed in place, so re-running this cell narrows it again.
demo_matches = demo_matches[demo_matches['queryID_target_base'].isin(bad_ids)].copy()
print(len(demo_matches))
print(len(bad_ids))
print(pos / (pos + neg))

# Per trainer name: one entry per repetition, each a list over `offsets`.
train_auc_top = {i.name: list() for i in func_obs}
val_auc_top = {i.name: list() for i in func_obs}

train_auc_all = {i.name: list() for i in func_obs}
val_auc_all = {i.name: list() for i in func_obs}

train_times = {i.name: list() for i in func_obs}

absolutes = [0,1e5]
offsets = [absolutes[i+1] - absolutes[i] for i in range(len(absolutes)-1)]

reps = 1

trained_obs = []

for model in func_obs:

    # Collects the trained copy from every repetition of this configuration.
    trained_obs_sub = list()

    for _ in range(reps):

        model_ = copy.deepcopy(model)

        accumulated = 0
        accumulated_time = 0
        train_aucs_top = list()
        val_aucs_top = list()
        train_aucs_all = list()
        val_aucs_all = list()

        for i in offsets:

            model_.max_iter = i

            start = time.time()
            model_.fit(demo_matches)
            accumulated_time += time.time() - start

            # One prediction per row; each row is materialised once instead of four times.
            demo_matches['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches.iterrows()]
            demo_matches_val['preds'] = [model_.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches_val.iterrows()]

            train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4))
            val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

            # First row attaining the maximum prediction within each (queryID, target_base) group.
            temp = demo_matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])
            temp_val = demo_matches_val.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])

            train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4))
            val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

            accumulated += model_.max_iter

        trained_obs_sub.append(copy.deepcopy(model_))

        # Record this repetition inside the reps loop so no repetition is lost when reps > 1.
        train_times[model.name].append(round(accumulated_time/60, 4))
        train_auc_all[model.name].append(train_aucs_all)
        train_auc_top[model.name].append(train_aucs_top)
        val_auc_all[model.name].append(val_aucs_all)
        val_auc_top[model.name].append(val_aucs_top)

    trained_obs.append(trained_obs_sub)

    print(model.name)

    # Ends up holding the last repetition of the last configuration in func_obs.
    model_4 = model_

train_auc_top, val_auc_top

In [None]:
# For each round's model, print every parameter named in init_vals, rounded to 2 decimals.
# NOTE: only the keys of init_vals are listed, so add_norm_a / add_norm_int are not printed.
for model in [model_1, model_2, model_3, model_4]:

    print('\n')
    for i in init_vals:
        print(i, round(getattr(model.sim_func, i),2))

In [None]:
# Score every train / validation pair with the four round models, attach the
# pre-computed classic similarity scores, and save one table per split.
demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_no_prec.pkl')

all_scores_train = dict()
all_scores_val = dict()

models = [model_1, model_2, model_3, model_4]
mod_names = ['model_1', 'model_2', 'model_3', 'model_4']

# Defined here so the cell does not depend on the guarded dataset-rebuild cell having run.
sim_names = ['prob', 'matusita', 'entropy', 'dot', 'lorentzian', 'harmonic']

# Identifier and label columns go LAST: the cells that follow slice them off by
# position (columns[:-1] after the groupby, iloc[:, :-3] for the correlations).
id_label_cols = ['queryID', 'target_base', 'score']

for mod_name, trained in zip(mod_names, models):

    print(mod_name)

    all_scores_train[mod_name] = [trained.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches.iterrows()]

all_scores_train['queryID'] = demo_matches['queryID'].tolist()
all_scores_train['target_base'] = demo_matches['target_base'].tolist()
# The freshly loaded pickle is not guaranteed to carry a 'score' column (the training
# cell derives it the same way), so derive the label from InchiCoreMatch here.
all_scores_train['score'] = (1 * demo_matches['InchiCoreMatch']).tolist()

del(demo_matches)

demo_matches_val = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/inputs/demo_matches_val_no_prec.pkl')

for mod_name, trained in zip(mod_names, models):

    all_scores_val[mod_name] = [trained.sim_func.predict(row['query'], row['target'], row['precquery'], row['prectarget'], grads = False) for _idx, row in demo_matches_val.iterrows()]
    print(mod_name)

all_scores_val['queryID'] = demo_matches_val['queryID'].tolist()
all_scores_val['target_base'] = demo_matches_val['target_base'].tolist()
all_scores_val['score'] = (1 * demo_matches_val['InchiCoreMatch']).tolist()
del(demo_matches_val)

# Classic similarity scores saved earlier as one .npy file per similarity and split.
for sim in sim_names:

    print(sim)

    all_scores_train[sim] = np.load(f'{sims_output_dir}/train_{sim}.npy')
    all_scores_val[sim] = np.load(f'{sims_output_dir}/val_{sim}.npy')

column_order = mod_names + sim_names + id_label_cols
all_scores_train = pd.DataFrame(all_scores_train)[column_order]
all_scores_val = pd.DataFrame(all_scores_val)[column_order]

all_scores_train.to_pickle('/Users/jonahpoczobutt/projects/TunaRes/sim_scores/train.pickle')
all_scores_val.to_pickle('/Users/jonahpoczobutt/projects/TunaRes/sim_scores/val.pickle')


In [None]:
# Reload the per-match score tables written by the scoring cell, so the
# analysis below can start from disk without re-scoring.
train_data, val_data = (
    pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/sim_scores/train.pickle'),
    pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/sim_scores/val.pickle'),
)

In [None]:
# Collapse to one row per (queryID, target_base) pair by taking the
# column-wise maximum of every remaining column within each group.
group_keys = ['queryID', 'target_base']
max_scores_train = train_data.groupby(group_keys).max()
max_scores_val = val_data.groupby(group_keys).max()

In [None]:
# AUC of each individual score column against the label, on the per-group
# maxima. The label is excluded by name: the tables are built with 'score'
# placed before the baseline similarity columns, so a positional
# `columns[:-1]` slice would score the label against itself and drop the
# last similarity.
for col in max_scores_train.columns.drop('score'):

    auc_train = round(roc_auc_score(max_scores_train['score'], max_scores_train[col]),4)
    auc_val = round(roc_auc_score(max_scores_val['score'], max_scores_val[col]),4)
    print(f"{col}: train: {auc_train} val: {auc_val}")

Train Correlations


In [None]:
# Pairwise correlation between all score columns. The identifier and label
# columns are removed by name because they are not the last three columns of
# the table (they sit between the model scores and the baseline similarities).
train_data.drop(columns=['queryID', 'target_base', 'score']).corr()

Now Train Models with Every Combination of Sim Scores, Old and New

Create column groups

In [None]:
# Enumerate every non-empty subset of the baseline ("old") similarity names
# and of the learned ("new") model names. Each subset is stored as a list so
# it can be used directly as a DataFrame column selector. Subsets are ordered
# by size, then in the order `itertools.combinations` yields them.
sim_names = ['prob', 'matusita', 'entropy', 'dot', 'lorentzian', 'harmonic']

# Upper bound is derived from the list length so it cannot go stale when a
# name is added or removed.
old_sim_combos = [
    list(comb)
    for n in range(1, len(sim_names) + 1)
    for comb in combinations(sim_names, n)
]

new_sims = ['model_1', 'model_2', 'model_3', 'model_4']
new_sim_combos = [
    list(comb)
    for n in range(1, len(new_sims) + 1)
    for comb in combinations(new_sims, n)
]

Train Models for each Column Group

In [None]:
def evaluate_sim_combos(combos, train_frame, val_frame, aggregate_preds, random_state=0):
    """Fit one gradient-boosting classifier per column combination and score it.

    Parameters
    ----------
    combos : list of lists of column names; each inner list is one feature set.
    train_frame, val_frame : DataFrames holding the feature columns and a
        'score' label column. When ``aggregate_preds`` is True they must also
        hold 'queryID' and 'target_base' columns.
    aggregate_preds : bool
        False - the frames are already one row per group; AUC is computed
        directly on the row-level predictions.
        True - the frames are per-match; after predicting, labels and
        predictions are reduced to their per-(queryID, target_base) maximum
        before the AUC is computed.
    random_state : seed passed to the classifier so repeated runs give the
        same models (scikit-learn uses it for binning subsampling and for the
        early-stopping validation split).

    Returns
    -------
    dict mapping '-'.join(combo) -> (train_auc, val_auc)
    """
    group_keys = ['queryID', 'target_base']
    performance = dict()

    for n_trained, combo in enumerate(combos, start=1):

        clf = hgbc(random_state=random_state)
        clf.fit(train_frame[combo], train_frame['score'])

        train_preds = clf.predict_proba(train_frame[combo])[:,1]
        val_preds = clf.predict_proba(val_frame[combo])[:,1]

        if aggregate_preds:
            # `assign` returns a new frame, so the caller's data is never
            # mutated and no up-front full copy is needed.
            train_top = train_frame.assign(preds=train_preds).groupby(group_keys)[['score', 'preds']].max()
            val_top = val_frame.assign(preds=val_preds).groupby(group_keys)[['score', 'preds']].max()

            train_labels, train_preds = train_top['score'], train_top['preds']
            val_labels, val_preds = val_top['score'], val_top['preds']
        else:
            train_labels = train_frame['score']
            val_labels = val_frame['score']

        performance['-'.join(combo)] = (
            roc_auc_score(train_labels, train_preds),
            roc_auc_score(val_labels, val_preds),
        )

        # lightweight progress indicator
        if n_trained % 10 == 0:
            print(n_trained)

    return performance


# consolidated=True trains and evaluates on the per-group maxima;
# consolidated=False trains on every match and takes the per-group maximum
# of the predictions afterwards.
consolidated = True
if consolidated:
    combo_train, combo_val = max_scores_train, max_scores_val
else:
    combo_train, combo_val = train_data, val_data

sim_performance_old = evaluate_sim_combos(old_sim_combos, combo_train, combo_val, aggregate_preds=not consolidated)
sim_performance_new = evaluate_sim_combos(new_sim_combos, combo_train, combo_val, aggregate_preds=not consolidated)



Train Performance

In [None]:
# Train AUC of the old (baseline) similarity combinations: one line per
# combination size (1..6) showing the mean and the maximum over all
# combinations of that size.
for combo_size in range(1, 7):
    train_aucs = [aucs[0] for name, aucs in sim_performance_old.items() if len(name.split('-')) == combo_size]
    print(np.mean(train_aucs), np.max(train_aucs))

In [None]:
# Train AUC of the new (learned) model combinations: one line per
# combination size (1..4) showing the mean and the maximum over all
# combinations of that size.
for combo_size in range(1, 5):
    train_aucs = [aucs[0] for name, aucs in sim_performance_new.items() if len(name.split('-')) == combo_size]
    print(np.mean(train_aucs), np.max(train_aucs))

Val Performance

In [None]:
# Validation AUC of the old (baseline) similarity combinations: one line per
# combination size (1..6) showing the mean and the maximum over all
# combinations of that size.
for combo_size in range(1, 7):
    val_aucs = [aucs[1] for name, aucs in sim_performance_old.items() if len(name.split('-')) == combo_size]
    print(np.mean(val_aucs), np.max(val_aucs))

In [None]:
# Validation AUC of the new (learned) model combinations: one line per
# combination size (1..4) showing the mean and the maximum over all
# combinations of that size.
for combo_size in range(1, 5):
    val_aucs = [aucs[1] for name, aucs in sim_performance_new.items() if len(name.split('-')) == combo_size]
    print(np.mean(val_aucs), np.max(val_aucs))

Best Model Inference

In [None]:
def report_best_by_size(performance):
    """Print the best combination of each size, chosen by validation AUC.

    Parameters
    ----------
    performance : dict mapping '-'-joined column names -> (train_auc, val_auc)

    For every combination size present in ``performance`` (ascending), prints
    ``size  best_key  train_auc  val_auc`` with both AUCs rounded to 4
    decimals. Ties go to the first key in insertion order. Sizes are taken
    from the keys themselves so no size that was trained is left out (a
    hard-coded 1..5 list omits the single 6-column combination).
    """
    keys_by_size = dict()
    for key in performance:
        keys_by_size.setdefault(len(key.split('-')), []).append(key)

    for size in sorted(keys_by_size):
        keys = keys_by_size[size]
        max_key = keys[np.argmax([performance[key][1] for key in keys])]
        print(size, max_key, round(performance[max_key][0],4), round(performance[max_key][1],4))


report_best_by_size(sim_performance_old)

print('\n')
report_best_by_size(sim_performance_new)

