In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import TunaSims
import tools_fast
from funcOb import func_ob

In [2]:
def harmonic_mean_distance(p, q):
    r"""
    Harmonic mean term between two intensity vectors:

    .. math::

        2\sum(\frac{P_{i}Q_{i}}{P_{i}+Q_{i}})

    This returns the summed term itself, NOT ``1 - 2\sum(...)``. That is the
    quantity this notebook compares against
    ``TunaSims.ExpandedTuna(mult_a=2, add_norm_b=1)``.

    Bins where ``P_i + Q_i == 0`` contribute 0 instead of producing ``nan``
    through ``0/0``. Inputs are assumed to be non-negative intensities.

    :param p: array-like of intensities for the first spectrum.
    :param q: array-like of intensities for the second spectrum, same shape as ``p``.
    :return: the scalar value of the sum above.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    total = p + q
    # Only divide where the denominator is non-zero; the pre-filled zeros in
    # `out` are kept everywhere else.
    terms = np.divide(p * q, total, out=np.zeros_like(total), where=total != 0)
    return 2 * np.sum(terms)

def lorentzian_distance(p, q):
    r"""
    Lorentzian distance:

    .. math::

        \sum{\ln(1+|P_i-Q_i|)}

    :param p: array-like of intensities for the first spectrum.
    :param q: array-like of intensities for the second spectrum, same shape as ``p``.
    :return: the scalar value of the sum above.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # log1p(x) evaluates ln(1 + x) without first rounding 1 + x, so very small
    # per-bin differences are not flushed to zero.
    return np.sum(np.log1p(np.abs(p - q)))

def matusita_distance(p, q, root=False):
    r"""
    Squared Matusita distance by default:

    .. math::

        \sum(\sqrt{P_{i}}-\sqrt{Q_{i}})^2

    With ``root=True`` the square root of that sum is returned, i.e. the
    Matusita distance proper:

    .. math::

        \sqrt{\sum(\sqrt{P_{i}}-\sqrt{Q_{i}})^2}

    The default is the un-rooted sum so that existing callers see no change.

    :param p: array-like of non-negative intensities for the first spectrum.
    :param q: array-like of non-negative intensities for the second spectrum, same shape as ``p``.
    :param root: when true, apply the outer square root.
    :return: scalar distance.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    squared = np.sum(np.square(np.sqrt(p) - np.sqrt(q)))
    return np.sqrt(squared) if root else squared

def probabilistic_symmetric_chi_squared_distance(p, q):
    r"""
    Probabilistic symmetric χ2 distance:

    .. math::

        \frac{1}{2} \times \sum\frac{(P_{i}-Q_{i})^2}{P_{i}+Q_{i}}

    Bins where ``P_i + Q_i == 0`` contribute 0 instead of producing ``nan``
    through ``0/0``. Inputs are assumed to be non-negative intensities.

    :param p: array-like of intensities for the first spectrum.
    :param q: array-like of intensities for the second spectrum, same shape as ``p``.
    :return: the scalar value of the expression above.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    total = p + q
    squared_diff = np.square(p - q)
    # Only divide where the denominator is non-zero; the pre-filled zeros in
    # `out` are kept everywhere else.
    terms = np.divide(squared_diff, total, out=np.zeros_like(total), where=total != 0)
    return 0.5 * np.sum(terms)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``; element-wise when ``z`` is an array."""
    decay = np.exp(-z)
    return 1 / (1 + decay)


In [3]:
# Two-row toy spectra. Column 1 is what the comparison cell normalises and
# feeds to the distance functions; column 0 is presumably m/z (TODO confirm).
query = np.array([
    [1, 10.0],
    [2, 50.0],
])
target = np.array([
    [1, 20.0],
    [2, 40.0],
])

In [4]:
# ExpandedTuna configured to be compared against harmonic_mean_distance
# (raw score, no sigmoid).
harmonic_mean = TunaSims.ExpandedTuna(
    query_normalized_intensity_a=1,
    target_normalized_intensity_a=1,
    sigmoid_score=False,
    mult_a=2,
    add_norm_b=1,
)

# ExpandedTuna configured to be compared against
# probabilistic_symmetric_chi_squared_distance (raw score, no sigmoid).
prob = TunaSims.ExpandedTuna(
    query_normalized_intensity_a=1,
    target_normalized_intensity_a=1,
    sigmoid_score=False,
    dif_a=1 / 2,
    dif_b=2,
    add_norm_b=1,
)

In [5]:
# Intensity columns scaled to sum to 1, as expected by the reference functions.
query_norm = query[:, 1] / sum(query[:, 1])
target_norm = target[:, 1] / sum(target[:, 1])

# Each printed value is (ExpandedTuna score - reference formula); ~0 means they agree.
harmonic_gap = harmonic_mean.predict(query, target) - harmonic_mean_distance(query_norm, target_norm)
print(f'harmonic_mean: {harmonic_gap}')

chi_gap = prob.predict(query, target) - probabilistic_symmetric_chi_squared_distance(query_norm, target_norm)
print(f'Chisquare: {chi_gap}')

harmonic_mean: 2.2075794303688667e-08
Chisquare: -3.449342873829142e-09


In [6]:
# Root of the TunaRes outputs. Set the TUNARES_DIR environment variable to run
# this notebook on another machine; the default is the original author's path.
TUNARES_DIR = Path(os.environ.get('TUNARES_DIR', '/Users/jonahpoczobutt/projects/TunaRes'))
TRAIN_CHUNK_PATH = (
    TUNARES_DIR
    / 'metlinGnps_NIST20_matchedPol'
    / 'intermediateOutputs'
    / 'splitMatches'
    / 'train'
    / '10_ppm'
    / 'chunk_1.pkl'
)

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
demo_matches = pd.read_pickle(TRAIN_CHUNK_PATH)

# NOTE(review): the disabled loop below presumably shows how the scores cached
# in 'harmonic_yerp.pkl' were produced; confirm before relying on it.
# demo_matches['score'] = 1 - demo_matches['InchiCoreMatch']
# matched_scores = list()
# for i in range(len(demo_matches)):
#     matched = tools_fast.match_spectrum(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], ms2_da = 0.05)
#     matched_scores.append(sigmoid(harmonic_mean_distance(matched[:,1]/sum(matched[:,1]), matched[:,2]/sum(matched[:,2]))))
# demo_matches['score'] = matched_scores

# Cached frame read from the working directory; its 'score' column is used below.
demo_matches_base = pd.read_pickle('harmonic_yerp.pkl')

In [29]:
# Copy the cached 'score' column onto demo_matches. pandas aligns the assigned
# Series on the index, so both frames are assumed to share an index (TODO confirm).
demo_matches['score'] = demo_matches_base['score']

In [30]:
# Same harmonic-mean parameters as before, but with sigmoid_score switched on.
harmonic_mean = TunaSims.ExpandedTuna(
    query_normalized_intensity_a=1,
    target_normalized_intensity_a=1,
    sigmoid_score=True,
    mult_a=2,
    add_norm_b=1,
)

# Score the first query/target pair as a spot check.
first_match = demo_matches.iloc[0]
harmonic_mean.predict(first_match['query'], first_match['target'])


0.5412237712923665

In [62]:
# Starting values handed to func_ob as `init_vals`.
init_vals = dict(
    mult_a=2,
    add_norm_b=1,
    target_normalized_intensity_a=1,
    query_normalized_intensity_a=1,
)

# Regularisation gradient that is 0 for every input.
regularization_grad = lambda x: 0.0

# Values handed to func_ob as `fixed_vals`.
fixed_vals = dict(sigmoid_score=True)

# (lower, upper) pairs keyed by parameter name.
bounds = {
    'add_norm_b': (0, 2),
    'mult_b': (0, 2),
    'dif_b': (0, 2),
    'add_norm_a': (1e-10, 3),
    'target_normalized_intensity_a': (1e-10, 2),
    'query_normalized_intensity_a': (1e-10, 2),
    'target_normalized_intensity_b': (1e-10, 2),
    'query_normalized_intensity_b': (1e-10, 2),
}

testerooni = func_ob(
    'teesterooni',
    sim_func=TunaSims.ExpandedTuna,
    init_vals=init_vals,
    fixed_vals=fixed_vals,
    regularization_grad=regularization_grad,
    bounds=bounds,
    max_iter=30000,
    lambdas=3,
    tol=0,
    balance_classes=False,
)

In [63]:
# Inspect `n_iter` on the freshly constructed object; presumably the number of
# optimiser iterations run so far (confirm in funcOb).
testerooni.n_iter

0

In [64]:
# Fit on the demo matches; with verbose = 1000 the captured output shows one
# progress line per 1000 iterations.
testerooni.fit(demo_matches, verbose = 1000)
# Report the `converged` attribute once fit returns.
print(testerooni.converged)


completed 1000 iterations
completed 2000 iterations
completed 3000 iterations
completed 4000 iterations
completed 5000 iterations
completed 6000 iterations
completed 7000 iterations
completed 8000 iterations
completed 9000 iterations
completed 10000 iterations
completed 11000 iterations
completed 12000 iterations
completed 13000 iterations
completed 14000 iterations
completed 15000 iterations
completed 16000 iterations
completed 17000 iterations
completed 18000 iterations
completed 19000 iterations
completed 20000 iterations
completed 21000 iterations
completed 22000 iterations
completed 23000 iterations
completed 24000 iterations
completed 25000 iterations
completed 26000 iterations
completed 27000 iterations
completed 28000 iterations
completed 29000 iterations
completed 30000 iterations
False


In [65]:
# Show the current value on sim_func of every parameter named in init_vals.
for param_name in testerooni.init_vals.keys():
    current_value = getattr(testerooni.sim_func, param_name)
    print(param_name, current_value)

mult_a 5.240623126207881
dif_a 0.024106377324789274
add_norm_b 0.20823708589607604
target_normalized_intensity_a 0.5943845486895035
query_normalized_intensity_a 0.525849646340274


In [13]:
# Score every query/target pair with the fitted similarity function.
# Iterating the two columns directly avoids building a row Series through
# `.iloc[i]` for each lookup.
preds = [
    testerooni.sim_func.predict(query_spec, target_spec)
    for query_spec, target_spec in zip(demo_matches['query'], demo_matches['target'])
]
#preds_base = [harmonic_mean.predict(q, t) for q, t in zip(demo_matches['query'], demo_matches['target'])]

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# AUC of the fitted predictions against the labels; predictions are flipped with 1 - x.
fitted_auc = roc_auc_score(demo_matches['score'], 1 - np.array(preds))
print(fitted_auc)

# Same metric for the cached baseline scores, shown as the cell's value.
baseline_flipped = 1 - np.array(demo_matches_base['score'])
roc_auc_score(demo_matches['score'], baseline_flipped)

0.5


0.7003463021288021

In [None]:
from collections import Counter
# Tally how many rows carry each value of 'score'.
Counter(demo_matches['score'])

Counter({1: 85421, 0: 14639})

In [None]:
# most_common() orders (label, count) pairs from most to least frequent, so
# counts[0] is the largest class and counts[1] the next one.
counts = Counter(demo_matches['score']).most_common()

# Split the largest-to-second-largest size ratio into its whole and fractional parts.
wholes, partials = divmod(1 / (counts[1][1] / counts[0][1]), 1)

In [None]:
# Display the fractional part of the class-size ratio.
partials

0.8351663364983946

In [None]:
def balanced_upsample(data):
    """
    Upsample the second most frequent 'score' class to the size of the most frequent one.

    The class sizes are taken from ``data`` itself. The smaller class is
    repeated in full as many times as it fits into the larger one, then
    topped up with its leading rows so both classes end up the same size.
    Rows belonging to any class beyond the two most frequent are dropped.

    :param data: DataFrame with a 'score' column holding the class labels.
    :return: new DataFrame with the larger class first, followed by the
        repeated copies of the smaller class. If ``data`` holds fewer than two
        classes there is nothing to balance and a copy is returned.
    """
    counts = Counter(data['score']).most_common()
    if len(counts) < 2:
        return data.copy()

    (majority_label, majority_n), (minority_label, minority_n) = counts[0], counts[1]

    # Integer arithmetic: `wholes` full copies plus `remainder` extra rows
    # gives exactly majority_n rows, with no floating-point truncation.
    wholes, remainder = divmod(majority_n, minority_n)

    majorities = data[data['score'] == majority_label]
    minorities = data[data['score'] == minority_label]

    return pd.concat([majorities] + [minorities] * wholes + [minorities[:remainder]])

Unnamed: 0,precquery,prectarget,query,target,queryID,target_base,InchiCoreMatch,InchiKeyMatch,score
59005,217.0496,217.0495,"[[98.9789, 10.7], [136.9821, 12.99], [137.4873...","[[77.0386, 4.0], [79.0541, 10.29], [89.0384, 1...",2947,QXKHYNVANLEOEG,True,True,0
59004,217.0496,217.0495,"[[98.9789, 10.7], [136.9821, 12.99], [137.4873...","[[79.0542, 2.0], [89.0385, 7.49], [90.0463, 15...",2947,QXKHYNVANLEOEG,True,True,0
59003,217.0496,217.0495,"[[98.9789, 10.7], [136.9821, 12.99], [137.4873...","[[89.0385, 13.69], [90.0463, 17.48], [91.0541,...",2947,QXKHYNVANLEOEG,True,True,0
59002,217.0496,217.0495,"[[98.9789, 10.7], [136.9821, 12.99], [137.4873...","[[77.0385, 3.1], [89.0384, 10.49], [90.0463, 1...",2947,QXKHYNVANLEOEG,True,True,0
59001,217.0496,217.0495,"[[98.9789, 10.7], [136.9821, 12.99], [137.4873...","[[77.0385, 4.5], [89.0384, 18.58], [90.0463, 1...",2947,QXKHYNVANLEOEG,True,True,0
...,...,...,...,...,...,...,...,...,...
209049,427.2140,427.2140,"[[110.0595, 6.86], [207.1128, 169.98], [427.21...","[[68.0493, 5.79], [69.0333, 32.17], [70.0652, ...",4536,PMXMIIMHBWHSKN,True,True,0
176405,221.0601,221.0601,"[[32.9811, 115.91], [86.0222, 51.54], [120.012...","[[86.0249, 31.07], [120.0127, 548.35], [134.02...",49307,ILRYLPWNYFXEMH,True,True,0
176409,221.0601,221.0601,"[[32.9811, 115.91], [86.0222, 51.54], [120.012...","[[73.0119, 11.79], [86.0248, 66.73], [100.0405...",49307,ILRYLPWNYFXEMH,True,True,0
176398,221.0601,221.0601,"[[32.9811, 115.91], [86.0222, 51.54], [120.012...","[[86.0248, 14.09], [120.0126, 159.54], [134.02...",49307,ILRYLPWNYFXEMH,True,True,0


In [None]:
# Rows labelled 1 in 'score'; in the captured output these all have InchiCoreMatch == False.
demo_matches[demo_matches['score'] == 1]

Unnamed: 0,precquery,prectarget,query,target,queryID,target_base,InchiCoreMatch,InchiKeyMatch,score
589098,389.1980,389.1970,"[[45.30843, 156.0], [45.537243, 70.0], [45.694...","[[55.0187, 1.5], [57.0345, 27.37], [69.0343, 1...",77786,MQIPJISWCOTCGQ,False,False,1
589102,389.1980,389.1970,"[[45.30843, 156.0], [45.537243, 70.0], [45.694...","[[55.0188, 43.66], [57.0345, 56.84], [67.0188,...",77786,MQIPJISWCOTCGQ,False,False,1
589096,389.1980,389.1970,"[[45.30843, 156.0], [45.537243, 70.0], [45.694...","[[83.0499, 2.5], [121.0656, 2.8], [123.0815, 4...",77786,MQIPJISWCOTCGQ,False,False,1
589101,389.1980,389.1970,"[[45.30843, 156.0], [45.537243, 70.0], [45.694...","[[55.0188, 33.97], [57.0345, 66.73], [67.0187,...",77786,MQIPJISWCOTCGQ,False,False,1
589097,389.1980,389.1970,"[[45.30843, 156.0], [45.537243, 70.0], [45.694...","[[57.0345, 20.08], [71.05, 5.09], [83.0502, 96...",77786,MQIPJISWCOTCGQ,False,False,1
...,...,...,...,...,...,...,...,...,...
999376,178.0873,178.0874,"[[105.0725, 5.68], [117.0707, 23.79], [178.087...","[[72.0089, 60.74], [72.0121, 1.4], [74.0245, 9...",57151,LJHYWUVYIKCPGU,False,False,1
932184,178.0873,178.0874,"[[105.0725, 5.68], [117.0707, 23.79], [178.087...","[[72.0091, 14.99], [117.0708, 5.19], [161.0606...",57151,DQLHSFUMICQIMB,False,False,1
999375,178.0873,178.0874,"[[105.0725, 5.68], [117.0707, 23.79], [178.087...","[[72.0089, 54.55], [74.0245, 999.0], [75.0217,...",57151,LJHYWUVYIKCPGU,False,False,1
999382,178.0873,178.0874,"[[105.0725, 5.68], [117.0707, 23.79], [178.087...","[[56.014, 923.98], [72.0089, 509.29], [74.0245...",57151,LJHYWUVYIKCPGU,False,False,1
