In [1]:
import copy
import itertools
import time

import numpy as np
import pandas as pd
import scipy
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
from sklearn.metrics import roc_auc_score

import tools_fast
import TunaSims
from funcOb import func_ob

In [2]:

def _weight_intensity_by_entropy(x):
    WEIGHT_START = 0.25
    ENTROPY_CUTOFF = 3
    weight_slope = (1 - WEIGHT_START) / ENTROPY_CUTOFF

    if np.sum(x) > 0:
        entropy_x = scipy.stats.entropy(x)
        if entropy_x < ENTROPY_CUTOFF:
            weight = WEIGHT_START + weight_slope * entropy_x
            x = np.power(x, weight)
            x_sum = np.sum(x)
            x = x / x_sum
    return x

def ppm(base, ppm):
    """
    Turn a parts-per-million tolerance into an absolute window.

    base: the reference value (the precursor exact mass).
    ppm: the tolerance expressed in parts per million.

    Returns ``base * ppm / 1e6``, i.e. the tolerance in the units of ``base``.
    """
    fraction = ppm / 1e6
    return base * fraction



In [3]:
def harmonic_mean_distance(p, q):
    r"""
    Harmonic mean similarity (the complement of the harmonic mean distance):

    .. math::

        2\sum(\frac{P_{i}Q_{i}}{P_{i}+Q_{i}})

    Parameters
    ----------
    p, q : array-like of equal shape
        Intensity vectors to compare.

    Returns
    -------
    float
        Twice the sum of the element-wise ``p*q / (p+q)`` terms.  Positions
        where both inputs are zero contribute 0 rather than turning the whole
        score into ``nan`` through a 0/0 division.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    total = p + q
    # Only divide where the denominator is non-zero; elsewhere keep the 0
    # that ``out`` was initialised with.
    terms = np.divide(p * q, total, out=np.zeros_like(total), where=total != 0)

    return 2 * np.sum(terms)

def lorentzian_distance(p, q):
    r"""
    Lorentzian similarity, returned as one minus the Lorentzian distance:

    .. math::

        1-\sum{\ln(1+|P_i-Q_i|)}
    """

    absolute_gap = np.abs(p - q)
    penalty = np.sum(np.log(1 + absolute_gap))
    return 1 - penalty

def matusita_distance(p, q):
    r"""
    Matusita-style similarity.  Note that no square root is taken of the sum
    and the result is subtracted from one:

    .. math::

        1-\sum(\sqrt{P_{i}}-\sqrt{Q_{i}})^2
    """

    root_gap = np.sqrt(p) - np.sqrt(q)
    squared_gap_total = np.sum(np.power(root_gap, 2))
    return 1 - squared_gap_total

def probabilistic_symmetric_chi_squared_distance(p, q):
    r"""
    Probabilistic symmetric χ2 similarity (one minus the distance):

    .. math::

        1-\frac{1}{2} \times \sum\frac{(P_{i}-Q_{i}\ )^2}{P_{i}+Q_{i}\ }

    Parameters
    ----------
    p, q : array-like of equal shape
        Intensity vectors to compare.

    Returns
    -------
    float
        One minus half the sum of ``(p-q)**2 / (p+q)``.  Positions where both
        inputs are zero contribute 0 instead of producing ``nan`` via 0/0.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    total = p + q
    # Skip the division wherever the denominator vanishes.
    terms = np.divide(np.power(p - q, 2), total,
                      out=np.zeros_like(total), where=total != 0)

    return 1 - (1 / 2 * np.sum(terms))

def entropy_distance(p, q):
    r"""
    Unweighted entropy similarity: one minus the entropy increase caused by
    merging the two vectors (the increase is not divided by ln(4) here).

    .. math::

        1-(2\times S_{PQ}-S_P-S_Q)

    where each ``S`` is the value returned by ``scipy.stats.entropy`` for the
    merged vector ``P+Q``, for ``P`` and for ``Q`` respectively.
    """

    entropy_merged = scipy.stats.entropy(p + q)
    entropy_increase = 2 * entropy_merged - scipy.stats.entropy(p) - scipy.stats.entropy(q)

    return 1 - entropy_increase

def dot_product_distance(p, q):
    r"""
    Dot product (cosine) similarity:

    .. math::

        \sqrt{\frac{(\sum{Q_iP_i})^2}{\sum{Q_i^2\sum P_i^2}}}
    """

    cross_term = np.sum(q * p)
    norm_product = np.sum(np.power(q, 2)) * np.sum(np.power(p, 2))
    squared_cosine = np.power(cross_term, 2) / norm_product
    return np.sqrt(squared_cosine)

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise by numpy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


Create old similarities

In [5]:
sims_output_dir = '/Users/jonahpoczobutt/projects/TunaRes/oldSimRes'

# demo_matches = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/train/10_ppm/chunk_1.pkl')
# demo_matches_val = pd.read_pickle('/Users/jonahpoczobutt/projects/TunaRes/metlinGnps_NIST20_matchedPol/intermediateOutputs/splitMatches/train/10_ppm/chunk_2.pkl')

demo_matches = pd.read_pickle('demo_matches_no_prec.pkl')
demo_matches_val = pd.read_pickle('demo_matches_val_no_prec.pkl')


def _drop_precursor_peaks(matches, ppm_tol = 3):
    """
    Keep only the peaks whose m/z lies below the precursor minus a ppm window.

    For the 'query' and 'target' spectrum columns, every spectrum is indexed
    with ``[:, 0]`` (first column compared against the precursor value from
    'precquery' / 'prectarget') and rows at or above
    ``precursor - ppm(precursor, ppm_tol)`` are removed.

    The filtered spectra are written back as whole columns.  Assigning through
    ``matches.iloc[i][col] = ...`` only modifies a temporary row object, so the
    filter would silently never reach the frame.

    Returns the same (modified) frame for convenience.
    """
    for spec_col, prec_col in (('query', 'precquery'), ('target', 'prectarget')):
        filtered = [
            spectrum[spectrum[:, 0] < precursor - ppm(precursor, ppm_tol)]
            for spectrum, precursor in zip(matches[spec_col], matches[prec_col])
        ]
        # dtype=object keeps each (variable-length) array as a single cell.
        matches[spec_col] = pd.Series(filtered, index = matches.index, dtype = object)

    return matches


demo_matches = _drop_precursor_peaks(demo_matches)
demo_matches_val = _drop_precursor_peaks(demo_matches_val)

# The filter is idempotent, so writing back to the files that were just read
# leaves already-filtered pickles unchanged.
demo_matches.to_pickle('demo_matches_no_prec.pkl')
demo_matches_val.to_pickle('demo_matches_val_no_prec.pkl')

# Binary label: 1 where the InChI core matches, 0 otherwise.
demo_matches['score'] = 1 * demo_matches['InchiCoreMatch']
demo_matches_val['score'] = 1 * demo_matches_val['InchiCoreMatch']

# Combined key handed to func_ob via groupby_column.
demo_matches['queryID_target_base'] = [
    str(query_id) + '_' + target_base
    for query_id, target_base in zip(demo_matches['queryID'], demo_matches['target_base'])
]

sim_names = ['prob','matusita','entropy','dot','lorentzian','harmonic']
distances = [probabilistic_symmetric_chi_squared_distance,
             matusita_distance,
             entropy_distance,
             dot_product_distance,
             lorentzian_distance,
             harmonic_mean_distance]

# Disabled: regenerates the baseline similarity arrays that later cells load
# from sims_output_dir. Re-enable when the cached .npy files must be rebuilt.
# for _ in range(len(sim_names)):

#      matched_scores_val = list()
#      for i in range(len(demo_matches_val)):

#           matched = tools_fast.match_spectrum(demo_matches_val.iloc[i]['query'], demo_matches_val.iloc[i]['target'], ms2_da = 0.05)
#           matched_scores_val.append(sigmoid(distances[_](matched[:,1]/sum(matched[:,1]), matched[:,2]/sum(matched[:,2]))))

#      np.save(f'{sims_output_dir}/val_{sim_names[_]}.npy', np.array(matched_scores_val))

#      matched_scores = list()
#      for i in range(len(demo_matches)):

#           matched = tools_fast.match_spectrum(demo_matches.iloc[i]['query'], demo_matches.iloc[i]['target'], ms2_da = 0.05)
#           matched_scores.append(sigmoid(distances[_](matched[:,1]/sum(matched[:,1]), matched[:,2]/sum(matched[:,2]))))

#      np.save(f'{sims_output_dir}/train_{sim_names[_]}.npy', np.array(matched_scores))


In [5]:
# Starting values for the intensity-only model.  Only the *_a / *_b terms of
# the normalized-intensity family get a starting value here; the *_int and *_c
# terms, the m/z family and the raw-intensity family are left out.
init_vals = dict(
    mult_a = 0.001,
    mult_b = 1,
    dif_a = 0.001,
    dif_b = 1,
    add_norm_b = 1,
    target_normalized_intensity_a = 0.1,
    query_normalized_intensity_a = 0.1,
    target_normalized_intensity_b = 0.1,
    query_normalized_intensity_b = 0.1,
)

# Starting values for the intensity + m/z model: everything from init_vals
# plus the *_int, *_a and *_b terms of the m/z family (the m/z *_c terms and
# the raw-intensity family remain left out).
init_vals_2 = {
    **init_vals,
    'target_mz_int': 1e-10,
    'query_mz_int': 1e-10,
    'target_mz_a': 0.001,
    'query_mz_a': 0.001,
    'target_mz_b': 0.001,
    'query_mz_b': 0.001,
}

# Regularization gradient that is identically zero for every input.
regularization_grad = lambda x: 0.0

# Fixed (non-numeric) settings; the two variants differ only in weight_combine.
fixed_vals = dict(sigmoid_score = True, weight_combine = 'multiply')
fixed_vals_2 = dict(sigmoid_score = True, weight_combine = 'add')

# (lower, upper) interval for every parameter name, including parameters that
# are not part of the starting-value dictionaries.
bounds = dict(
    # combination terms
    add_norm_b = (0, 2),
    mult_b = (1e-10, 2),
    add_norm_a = (1e-10, 3),
    dif_b = (1e-10, 2),
    dif_a = (-1.5, 1.5),
    mult_a = (-1.5, 1.5),
    # normalized-intensity family
    target_normalized_intensity_int = (-0.2, 1),
    query_normalized_intensity_int = (-0.2, 1),
    target_normalized_intensity_a = (1e-10, 2),
    query_normalized_intensity_a = (1e-10, 2),
    target_normalized_intensity_b = (0, 2),
    query_normalized_intensity_b = (0, 2),
    target_normalized_intensity_c = (-2, 2),
    query_normalized_intensity_c = (-2, 2),
    # m/z family
    target_mz_b = (-2, 2),
    query_mz_b = (-2, 2),
    target_mz_a = (-2, 2),
    query_mz_a = (-2, 2),
    target_mz_int = (-0.2, 1),
    query_mz_int = (-0.2, 1),
    target_mz_c = (-2, 2),
    query_mz_c = (-2, 2),
    # raw-intensity family
    target_intensity_int = (-0.2, 1),
    query_intensity_int = (-0.2, 1),
    target_intensity_a = (1e-10, 2),
    query_intensity_a = (1e-10, 2),
    target_intensity_b = (1e-10, 2),
    query_intensity_b = (1e-10, 2),
    target_intensity_c = (1e-10, 2),
    query_intensity_c = (1e-10, 2),
)

# Single reference model: plain fixed learning rate, no momentum, no
# scheduler, no class balancing.
testerooni = func_ob(
    'testerooni',
    sim_func = TunaSims.ExpandedTuna,
    init_vals = init_vals,
    fixed_vals = fixed_vals,
    regularization_grad = regularization_grad,
    bounds = bounds,
    max_iter = 1e6,
    learning_rates = 0.001,
    momentum_type = None,
    learning_rate_scheduler = None,
    tol = 0,
    balance_classes = False,
)

init_names = ['intensity','intensitymz']
inits = [init_vals, init_vals_2]
func_obs = list()

# One model per (momentum, scheduler, starting-value set) combination; each
# model receives its own copy of the starting values.
# A disabled earlier variant drew every starting value with
# np.random.uniform(bounds[key][0], bounds[key][1]) and named the models
# f'{momentum}_{sched}_all_{i}'.
for _round in range(1):
    for momentum in [None]:
        for sched in ['ad_mult']:
            for init_name, start_vals in zip(init_names, inits):

                candidate = func_ob(
                    f'{momentum}_{sched}_{init_name}',
                    sim_func = TunaSims.ExpandedTuna,
                    init_vals = start_vals.copy(),
                    fixed_vals = fixed_vals,
                    regularization_grad = regularization_grad,
                    bounds = bounds,
                    max_iter = 1e6,
                    learning_rates = 0.001,
                    momentum_type = momentum,
                    learning_rate_scheduler = sched,
                    learning_beta = 0.5,
                    momentum_beta = 0.5,
                    tol = 0,
                    balance_classes = True,
                    groupby_column = 'queryID_target_base',
                )
                func_obs.append(candidate)

In [7]:
# Per-model result containers; each gets one entry per repetition.
train_auc_top = {ob.name: list() for ob in func_obs}
val_auc_top = {ob.name: list() for ob in func_obs}

train_auc_all = {ob.name: list() for ob in func_obs}
val_auc_all = {ob.name: list() for ob in func_obs}

train_times = {ob.name: list() for ob in func_obs}

# Cumulative iteration checkpoints and the increments between them.
absolutes = [0, 1e3, 1e4, 1e5]
offsets = [absolutes[i+1] - absolutes[i] for i in range(len(absolutes)-1)][:5]

reps = 2

trained_obs = []


def _predict_rows(fitted, matches, **predict_kwargs):
    """Call fitted.sim_func.predict once per row of ``matches``; returns a list."""
    return [
        fitted.sim_func.predict(query, target, prec_query, prec_target, **predict_kwargs)
        for query, target, prec_query, prec_target in zip(
            matches['query'], matches['target'], matches['precquery'], matches['prectarget'])
    ]


def _top_hit_per_group(matches):
    """First row with the maximal 'preds' inside every (queryID, target_base) group."""
    return matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])


for model in func_obs:

    # One trained copy per repetition of this model.
    trained_obs_sub = list()

    for _ in range(reps):

        # Every repetition starts from an untouched copy of the template.
        model_ = copy.deepcopy(model)

        accumulated_time = 0
        train_aucs_top = list()
        val_aucs_top = list()
        train_aucs_all = list()
        val_aucs_all = list()

        for n_iter in offsets:

            # Models with 'top' in their name get a tenth of the iterations.
            if 'top' in model.name:
                model_.max_iter = n_iter / 10
            else:
                model_.max_iter = n_iter

            start = time.time()
            model_.fit(demo_matches)
            accumulated_time += time.time() - start

            # NOTE(review): the training call passes grads = False while the
            # validation call relies on predict's default - confirm they agree.
            demo_matches['preds'] = _predict_rows(model_, demo_matches, grads = False)
            demo_matches_val['preds'] = _predict_rows(model_, demo_matches_val)

            train_aucs_all.append(round(roc_auc_score(demo_matches['score'] , demo_matches['preds']), 4))
            val_aucs_all.append(round(roc_auc_score(demo_matches_val['score'] , demo_matches_val['preds']),4))

            temp = _top_hit_per_group(demo_matches)
            temp_val = _top_hit_per_group(demo_matches_val)

            train_aucs_top.append(round(roc_auc_score(temp['score'] , temp['preds']), 4))
            val_aucs_top.append(round(roc_auc_score(temp_val['score'] , temp_val['preds']),4))

        # Record this repetition inside the reps loop, so that every
        # repetition is kept rather than only the last one.
        trained_obs_sub.append(copy.deepcopy(model_))
        train_times[model.name].append(round(accumulated_time/60, 4))
        train_auc_all[model.name].append(train_aucs_all)
        train_auc_top[model.name].append(train_aucs_top)
        val_auc_all[model.name].append(val_aucs_all)
        val_auc_top[model.name].append(val_aucs_top)

    trained_obs.append(trained_obs_sub)

    print(model.name)

key='mult_a', current_value=0.001, updated=0.0009393207051327057, learning_rate=0.002, unweighted_step=0.030339647433647164, grad=0.030339647433647164, step=6.067929486729433e-05
key='mult_b', current_value=1, updated=1.0000001697316352, learning_rate=0.001, unweighted_step=-0.00016973163526435596, grad=-0.00016973163526435596, step=-1.6973163526435597e-07
key='dif_a', current_value=0.001, updated=-0.03735046462097834, learning_rate=0.002, unweighted_step=19.175232310489168, grad=19.175232310489168, step=0.03835046462097834
key='dif_b', current_value=1, updated=1.00005603640226, learning_rate=0.001, unweighted_step=-0.05603640226000586, grad=-0.05603640226000586, step=-5.603640226000587e-05
key='add_norm_b', current_value=1, updated=0.9998885287249206, learning_rate=0.002, unweighted_step=0.05573563753966625, grad=0.05573563753966625, step=0.0001114712750793325
key='target_normalized_intensity_a', current_value=0.1, updated=0.10000461885866775, learning_rate=0.001, unweighted_step=-0.0

ValueError: loss grad is nan

In [None]:
# Scratch check: how far TunaSims' sigmoid of a tiny positive input sits from 0.5.
TunaSims.TunaSim.sigmoid(0.00001) - 0.5

In [None]:
# Display the per-model training AUCs computed on the top hit of each group.
train_auc_top

In [None]:
# Display the per-model validation AUCs computed on the top hit of each group.
val_auc_top

In [None]:
# NOTE(review): `yool` is not defined anywhere in the visible notebook, so this
# cell raises NameError - presumably a deliberate stop for "Run All"; confirm.
print(yool)

In [None]:
# Show the fitted value of every initialised parameter of the first trained
# copy of the first model.
print('trained values')
first_trained = trained_obs[0][0]
for param_name in first_trained.init_vals.keys():
    print(param_name, getattr(first_trained.sim_func, param_name))

In [None]:
def _predict_with_testerooni(matches, **predict_kwargs):
    """Score every row of ``matches`` with testerooni's similarity function."""
    return np.array([
        testerooni.sim_func.predict(query, target, prec_query, prec_target, **predict_kwargs)
        for query, target, prec_query, prec_target in zip(
            matches['query'], matches['target'], matches['precquery'], matches['prectarget'])
    ])


def _top_hit_auc(matches):
    """ROC AUC (4 decimals) over the best-scoring row of each (queryID, target_base) group."""
    top = matches.groupby(by=['queryID','target_base']).apply(lambda x: x[x['preds'] == max(x['preds'])].iloc[0])
    return round(roc_auc_score(top['score'] , top['preds']), 4)


print('round 1')
testerooni.fit(demo_matches.copy(), verbose = 1e7)

# Keep the first-stage predictions as standalone arrays: the later residual
# and stacking cells refer to `preds` / `preds_val`, and the 'preds' columns
# are overwritten by the baseline loop at the end of this cell.
# NOTE(review): only the training call passes grads = False - confirm that
# predict's default gives the same score for the validation call.
preds = _predict_with_testerooni(demo_matches, grads = False)
preds_val = _predict_with_testerooni(demo_matches_val)
demo_matches['preds'] = preds
demo_matches_val['preds'] = preds_val

print('trained values')
for param_name in testerooni.init_vals.keys():
    print(param_name, getattr(testerooni.sim_func, param_name))

print('\n')
print('aucs')
print('Custom ', _top_hit_auc(demo_matches), _top_hit_auc(demo_matches_val))

# Baseline similarities loaded from disk; each row is labelled with its name
# so the numbers can be told apart.
for sim in sim_names:
    demo_matches['preds'] = np.load(f'{sims_output_dir}/train_{sim}.npy')
    demo_matches_val['preds'] = np.load(f'{sims_output_dir}/val_{sim}.npy')

    print(sim, _top_hit_auc(demo_matches), _top_hit_auc(demo_matches_val))


In [None]:
# Top-hit AUC (train, validation) of every stored baseline similarity.
for sim in sim_names:
    demo_matches['preds'] = np.load(f'{sims_output_dir}/train_{sim}.npy')
    demo_matches_val['preds'] = np.load(f'{sims_output_dir}/val_{sim}.npy')

    # Best-scoring row of each (queryID, target_base) group.
    temp = demo_matches.groupby(by=['queryID','target_base']).apply(
        lambda grp: grp[grp['preds'] == max(grp['preds'])].iloc[0])
    temp_val = demo_matches_val.groupby(by=['queryID','target_base']).apply(
        lambda grp: grp[grp['preds'] == max(grp['preds'])].iloc[0])

    auc_train = round(roc_auc_score(temp['score'] , temp['preds']), 4)
    auc_val = round(roc_auc_score(temp_val['score'] , temp_val['preds']), 4)
    print(sim, auc_train, auc_val)

In [None]:
import matplotlib.pyplot as plt


def _show_histogram(values, title):
    """Draw a 100-bin histogram of ``values`` under ``title`` and show it."""
    plt.hist(values, bins = 100)
    plt.title(title)
    plt.show()


_show_histogram(preds, 'Preds Train')

_show_histogram(preds_val, 'Preds Val')

In [None]:
# NOTE(review): `yool` is not defined anywhere in the visible notebook, so this
# cell raises NameError - presumably a deliberate stop for "Run All"; confirm.
print(yool)

In [None]:
# Absolute gap between the label and the first-stage prediction, per row.
residual_threshold = 0.2
demo_matches['residual'] = (demo_matches['score'] - preds).abs()




Round 2

In [None]:
# Rebuild the binary labels from 'InchiCoreMatch' (the same expression that
# created 'score') instead of reading 'score' back.  'score' is overwritten
# below, so re-running this cell would otherwise capture the already
# transformed values as the "original" labels.
original_labels_train = 1 * demo_matches['InchiCoreMatch']
original_labels_val = 1 * demo_matches_val['InchiCoreMatch']
#maintain mapping to 0 1 interval
demo_matches['score'] = (original_labels_train - preds + 1) / 2
plt.hist(demo_matches['score'], bins = 100)
plt.title('train_residuals')
plt.show()

In [None]:
# Second-stage model, fitted on the rescaled first-stage residuals in 'score'.
# NOTE(review): this call passes `lambdas` where the other func_ob calls in
# this notebook pass `learning_rates` - confirm func_ob accepts this keyword.
testerooni = func_ob('teesterooni',
                     sim_func = TunaSims.ExpandedTuna,
                     init_vals = init_vals,
                     fixed_vals = fixed_vals,
                     regularization_grad = regularization_grad,
                     bounds = bounds,
                     max_iter = 1000000,
                     lambdas = 0.001,
                     tol = 0,
                     balance_classes = False)

testerooni.fit(demo_matches, verbose = 1e7)
print(testerooni.converged)


def _predict_stage(matches):
    """Score every row of ``matches`` with the current testerooni model."""
    return np.array([
        testerooni.sim_func.predict(query, target, prec_query, prec_target)
        for query, target, prec_query, prec_target in zip(
            matches['query'], matches['target'], matches['precquery'], matches['prectarget'])
    ])


preds_2 = _predict_stage(demo_matches)
preds_val_2 = _predict_stage(demo_matches_val)

plt.hist(preds_2, bins = 100)
plt.title('Train Preds 2')
plt.show()

# Plot the validation predictions under the validation title.
plt.hist(preds_val_2, bins = 100)
plt.title('Val Preds 2')
plt.show()

print('trained values')
for param_name in testerooni.init_vals.keys():
    print(param_name, getattr(testerooni.sim_func, param_name))

# Undo the (x + 1) / 2 rescaling of the second stage before adding it to the
# first-stage prediction.
preds_combined = preds + (2 * preds_2 - 1)
preds_combined_val = preds_val + (2 * preds_val_2 - 1)

plt.hist((1 + original_labels_train - preds_combined) / 2, bins = 100)
plt.title('Two Stage Train Residuals')
plt.show()

plt.hist((1 + original_labels_val - preds_combined_val) / 2, bins = 100)
plt.title('Two Stage Val Residuals')
plt.show()

print('\n')
print('aucs')
print(round(roc_auc_score(original_labels_train , preds_combined), 4), round(roc_auc_score(original_labels_val, preds_combined_val), 4))
for sim in sim_names:
    print(sim, round(roc_auc_score(original_labels_train, np.load(f'{sims_output_dir}/train_{sim}.npy')),4), round(roc_auc_score(original_labels_val, np.load(f'{sims_output_dir}/val_{sim}.npy')),4))

Round 3

In [None]:

#maintain mapping to 0 1 interval
# The third-stage target is the residual of the ORIGINAL labels against the
# two-stage prediction (the same quantity the 'Two Stage Train Residuals'
# histogram shows).  Using the current 'score' column here would subtract the
# combined prediction from the already-rescaled second-stage target.
demo_matches['score'] = (original_labels_train - preds_combined + 1) / 2
plt.hist(demo_matches['score'], bins = 100)
plt.title('train_residuals')
plt.show()

# NOTE(review): this call passes `lambdas` where the other func_ob calls in
# this notebook pass `learning_rates` - confirm func_ob accepts this keyword.
testerooni = func_ob('teesterooni',
                     sim_func = TunaSims.ExpandedTuna,
                     init_vals = init_vals,
                     fixed_vals = fixed_vals,
                     regularization_grad = regularization_grad,
                     bounds = bounds,
                     max_iter = 1000000,
                     lambdas = 0.001,
                     tol = 0,
                     balance_classes = False)

testerooni.fit(demo_matches, verbose = 100000)
print(testerooni.converged)


def _predict_stage_3(matches):
    """Score every row of ``matches`` with the third-stage testerooni model."""
    return np.array([
        testerooni.sim_func.predict(query, target, prec_query, prec_target)
        for query, target, prec_query, prec_target in zip(
            matches['query'], matches['target'], matches['precquery'], matches['prectarget'])
    ])


preds_3 = _predict_stage_3(demo_matches)
preds_val_3 = _predict_stage_3(demo_matches_val)


In [None]:
# One column per stored baseline similarity ...
all_scores_train = {sim: np.load(f'{sims_output_dir}/train_{sim}.npy') for sim in sim_names}
all_scores_val = {sim: np.load(f'{sims_output_dir}/val_{sim}.npy') for sim in sim_names}

# ... followed by the three learned stages and the original labels.
all_scores_train.update(
    preds = preds,
    preds2 = preds_2,
    preds3 = preds_3,
    score = original_labels_train,
)
all_scores_val.update(
    preds = preds_val,
    preds2 = preds_val_2,
    preds3 = preds_val_3,
    score = original_labels_val,
)

train_data = pd.DataFrame(all_scores_train)
val_data = pd.DataFrame(all_scores_val)



Train Correlations


In [None]:
# Pairwise correlations between all score columns and the label (training set).
train_data.corr()

Val Correlations

In [None]:
# Pairwise correlations between all score columns and the label (validation set).
val_data.corr()

Now Train Models with Each Pair/Triplet of Sim Scores Old and New

Create column groups

In [None]:
def _subsets_up_to_three(names):
    """
    Every distinct group of 1, 2 or 3 names, each as a list.

    itertools.combinations yields each subset exactly once and keeps the
    input order inside a subset.  A triple loop over the names deduplicated
    per iteration with ``set`` instead produces one entry per ordered triple
    (len(names)**3 entries, most of them repeats) with hash-dependent column
    order.
    """
    return [
        list(group)
        for size in (1, 2, 3)
        for group in itertools.combinations(names, size)
    ]


old_sim_combos = _subsets_up_to_three(sim_names)

new_sims = ['preds', 'preds2', 'preds3']
new_sim_combos = _subsets_up_to_three(new_sims)

Train Models for each Column Group

In [None]:
def _score_column_groups(combos, random_state = 0):
    """
    Fit one gradient-boosting classifier per column group and score it.

    combos: iterable of lists of column names present in train_data/val_data.
    random_state: seed handed to the classifier so repeated runs of the cell
        give the same numbers.

    Returns a dict mapping '_'.join(columns) to (train AUC, validation AUC).
    """
    performance = dict()

    for combo in combos:

        print(combo)

        model = hgbc(random_state = random_state)
        model.fit(train_data[combo], train_data['score'])
        train_auc = roc_auc_score(original_labels_train, model.predict_proba(train_data[combo])[:,1])
        val_auc = roc_auc_score(original_labels_val, model.predict_proba(val_data[combo])[:,1])

        performance['_'.join(combo)] = (train_auc, val_auc)

    return performance


# Baseline similarities only, then learned stages only.
sim_performance = _score_column_groups(old_sim_combos)
sim_performance_new = _score_column_groups(new_sim_combos)



In [None]:
# Mean validation AUC of the baseline-only models, by number of columns used.
for n_columns in (1, 2, 3):
    print(np.mean([aucs[1] for name, aucs in sim_performance.items() if len(name.split('_')) == n_columns]))

In [None]:
# Mean validation AUC of the learned-stage models, by number of columns used.
for n_columns in (1, 2, 3):
    print(np.mean([aucs[1] for name, aucs in sim_performance_new.items() if len(name.split('_')) == n_columns and 'preds' in name]))

In [None]:
# Learned stages that include the first-stage 'preds' column, each extended by
# one baseline similarity.
sim_performance_2 = dict()
for combo in new_sim_combos:

    if 'preds' not in combo:
        continue

    for sim in sim_names:

        combo_new = combo + [sim]

        # Fixed seed so repeated runs of the cell give the same numbers.
        model = hgbc(random_state = 0)
        model.fit(train_data[combo_new], train_data['score'])
        train_auc = roc_auc_score(original_labels_train, model.predict_proba(train_data[combo_new])[:,1])
        val_auc = roc_auc_score(original_labels_val, model.predict_proba(val_data[combo_new])[:,1])

        sim_performance_2['_'.join(combo_new)] = (train_auc, val_auc)

In [None]:
# Mean validation AUC of the mixed (learned + baseline) models, by number of
# columns used.
for n_columns in (2, 3):
    print(np.mean([aucs[1] for name, aucs in sim_performance_2.items() if len(name.split('_')) == n_columns]))
