In [1]:
%reload_ext autoreload
%autoreload 2

import os, psutil

from rocket_fft import numpy_like, scipy_like

# Kernel setup. The environment variables below are assigned before
# numpy / pandas / alphatims are imported further down in this cell; keep that
# order unless it has been confirmed that it does not matter.

# Call the `numpy_like` hook imported from rocket_fft.
# NOTE(review): presumably switches rocket_fft to numpy-compatible FFT
# behaviour (as opposed to `scipy_like`) — confirm against the rocket_fft docs.
numpy_like()

# Both settings are written as strings into this process' environment.
# NOTE(review): presumably picked up by numexpr / numba when those packages are
# first imported — confirm.
os.environ['NUMEXPR_MAX_THREADS'] = '20'
os.environ['NUMBA_DEBUGINFO'] = '0'

# Initialise alphadia's process logger before fetching a logger handle.
from alphadia.extraction import processlogger
processlogger.init_logging()
import logging

# `logging.getLogger()` without a name returns the root logger.
logger = logging.getLogger()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import neptune.new as neptune
import alphatims.bruker as bruker

from alphabase.spectral_library.base import SpecLibBase
from alphadia.extraction.planning import Plan, Workflow

yaml_file = 'config.yaml'

# Root folder of the benchmarking data set. The default is the original
# location; set the ALPHADIA_BENCHMARK_DIR environment variable to run the
# notebook on a machine where the data lives somewhere else.
benchmark_dir = os.environ.get(
    'ALPHADIA_BENCHMARK_DIR',
    '/Users/georgwallmann/Documents/data/alphadia_benchmarking',
)

# Folder holding the Bruker .d acquisitions of this experiment.
raw_data_dir = os.path.join(benchmark_dir, 'raw_data', '2023_04_27_synchroPasef_mDIA')

# Acquisition folder names, in processing order.
raw_file_names = [
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_4dig_scans_S2-C1_1_1805.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_4dig_scans_S2-G1_1_1829.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_4dig_scans_S2-E5_1_1821.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_4dig_scans_S2-D3_1_1813.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_2scans_each150Da_S2-B6_1_1804.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_2scans_each150Da_S2-F6_1_1828.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_2scans_each150Da_S2-E4_1_1820.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_SyP_2scans_each150Da_S2-D2_1_1812.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_P001_diaP_pydiAID8_1300V_S2-D1_1_1811.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_P001_diaP_pydiAID8_1300V_S2-F5_1_1827.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_P001_diaP_pydiAID8_1300V_S2-E3_1_1819.d',
    '20230422_TIMS05_PaSk_MCT_SA_HeLa_mDIA_P001_diaP_pydiAID8_1300V_S2-H1_1_1835.d',
]

# Full paths handed to the search plan.
raw_files = [os.path.join(raw_data_dir, raw_file_name) for raw_file_name in raw_file_names]

# Folder that receives the per-run result tables.
output_location = os.path.join(
    benchmark_dir,
    'alphadia_runs',
    '2023_04_27_alphadia_mDIA_synchroPasef',
    'data_small_lib_mbr_15ppm',
)

# Read the Neptune API token from the environment. When it is missing the
# problem is logged and `neptune_token` falls back to None, so the name is
# always bound and later code can test for it instead of hitting a NameError.
neptune_token = os.environ.get('NEPTUNE_TOKEN')
if neptune_token is None:
    logger.error('NEPTUNE_TOKEN environment variable not set')

0:00:00.785664 [38;20m INFO: [0m


In [2]:
# Path of the HDF spectral library searched in this notebook.
test_lib_location = '/Users/georgwallmann/Documents/data/alphadia_benchmarking/libraries/marvin_scp/MSfragger_library_mod_noLossType_d0_d4_d8_d12_shared_eg_n_fragments_mbr.hdf'

# Start from an empty library object and fill it from the HDF file;
# `load_mod_seq=True` is forwarded to `load_hdf`.
test_lib = SpecLibBase()
test_lib.load_hdf(test_lib_location, load_mod_seq=True)

In [3]:
import time

In [4]:
# Overrides for the 'extraction' section of the config; they are passed to
# Plan as a config update dict.
extraction_overrides = {
    'target_mobility_tolerance': 0.04,
    'target_rt_tolerance': 30,
    'target_ms1_tolerance': 15,
    'min_epochs': 3,
}

# Build the plan for all raw files and attach the spectral library to it.
plan = Plan(raw_files, config_update={'extraction': extraction_overrides})
plan.from_spec_lib_base(test_lib)
# Imports needed by the per-run loop. They are executed once here instead of
# being re-executed on every loop iteration.
from alphadia.extraction import utils, plexscoring, quadrupole
import alphatims.utils
from tqdm import tqdm
import numba as nb

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.neural_network import MLPClassifier

from alphadia.library import fdr_to_q_values

# Fixed seed for the train/test split and the MLP initialisation so that the
# reported identifications are reproducible between runs.
RANDOM_STATE = 42

# Feature columns that are removed before training the classifier.
EXCLUDED_FEATURE_COLUMNS = [
    'top3_reference_template_frame_cosine',
    'top3_reference_template_scan_cosine',
    'top3_y_ion_correlation',
    'top3_b_ion_correlation',
]

# Columns of the feature table that are bookkeeping, not classifier input.
METADATA_COLUMNS = ['channel', 'precursor_idx', 'decoy', 'proteins']

# Every table below is written into this folder; create it up front so that a
# long extraction does not fail at the first `to_csv`.
os.makedirs(output_location, exist_ok=True)

# One iteration per raw file: search the reference channel (0), transfer the
# confident identifications to the other channels, score them, and estimate an
# FDR for channels 4 and 8 using channel 12 as the decoy channel.
for dia_data, precursors_flat, fragments_flat in plan.get_run_data():

    # timing starts after the raw data for this run has been yielded
    start_time = time.time()

    # ------------------------------------------------------------------
    # 1) search of the reference channel
    # ------------------------------------------------------------------
    reference_flat = precursors_flat[precursors_flat['channel'] == 0].copy()

    workflow = Workflow(
        plan.config,
        dia_data,
        reference_flat,
        fragments_flat
    )

    workflow.calibration()
    d0_df = workflow.extraction()

    # the unfiltered reference result is stored before any q-value filtering
    raw_name = precursors_flat['raw_name'].iloc[0]
    d0_df.to_csv(os.path.join(output_location, f'{raw_name}_d0.tsv'), sep = '\t', index = False)

    # keep target identifications with q-value below 1%
    d0_df = d0_df[d0_df['qval'] < 0.01]
    d0_df = d0_df[d0_df['decoy'] == 0]

    # NOTE(review): presumably writes the calibrated columns (e.g. the
    # `mz_calibrated` column read further down) into the full tables — confirm.
    workflow.calibration_manager.predict(precursors_flat, 'precursor')
    workflow.calibration_manager.predict(fragments_flat, 'fragment')

    # ------------------------------------------------------------------
    # 2) build candidates for all channels from the reference identifications
    # ------------------------------------------------------------------
    multiplex = plexscoring.Multiplexer(precursors_flat, fragments_flat, d0_df.copy())
    candidates_df = multiplex()
    candidates_df['rank'] = np.zeros(len(candidates_df), dtype = np.int64)
    candidates_df = utils.calculate_score_groups(candidates_df, group_channels=True)

    score_group_container = plexscoring.ScoreGroupContainer()
    score_group_container.build_from_df(
        candidates_df['elution_group_idx'].values.astype(np.uint32),
        candidates_df['score_group_idx'].values.astype(np.uint32),
        candidates_df['precursor_idx'].values.astype(np.uint32),
        candidates_df['channel'].values.astype(np.uint8),
        candidates_df['flat_frag_start_idx'].values.astype(np.uint32),
        candidates_df['flat_frag_stop_idx'].values.astype(np.uint32),

        candidates_df['scan_start'].values,
        candidates_df['scan_stop'].values,
        candidates_df['scan_center'].values,
        candidates_df['frame_start'].values,
        candidates_df['frame_stop'].values,
        candidates_df['frame_center'].values,

        candidates_df['charge'].values,
        candidates_df['mz_calibrated'].values.astype(np.float32),
        candidates_df[utils.get_isotope_column_names(candidates_df.columns)].values.astype(np.float32),
    )

    q = quadrupole.SimpleQuadrupole(dia_data.cycle)
    fragment_container = plexscoring.assemble_fragments(fragments_flat)

    config = plexscoring.CandidateConfig()
    config.max_cardinality = 1
    config.score_grouped = True

    alphatims.utils.set_threads(10)

    # ------------------------------------------------------------------
    # 3) score every score group; results are stored on the container
    # ------------------------------------------------------------------
    plexscoring._executor(
        range(len(score_group_container)),
        score_group_container,
        fragment_container,
        dia_data,
        config.jitclass(),
        q.jit,
        False
    )

    # ------------------------------------------------------------------
    # 4) collect the features of all scored non-reference candidates
    # ------------------------------------------------------------------
    dict_list = []
    precursor_idx_list = []
    channel_list = []

    for elem in tqdm(score_group_container):
        for candidate in elem.candidates:
            if (len(candidate.features) > 0) and (candidate.channel != 0):

                precursor_idx_list.append(candidate.precursor_idx)
                dict_list.append(candidate.features)
                channel_list.append(candidate.channel)

    df = pd.DataFrame(dict_list)
    df['precursor_idx'] = precursor_idx_list
    df['channel'] = channel_list

    df = df.merge(
        precursors_flat[['precursor_idx', 'decoy', 'proteins',]],
        on='precursor_idx',
        how='left'
    )

    # Drop the excluded features by name. Indexing a DataFrame with a set is
    # rejected by recent pandas versions and gives an arbitrary column order;
    # `drop` keeps the original order and ignores names that are not present.
    features_df = df.drop(columns=EXCLUDED_FEATURE_COLUMNS, errors='ignore').copy()

    # classifier input columns, in the (deterministic) order of the table
    all_feature_columns = [
        column for column in features_df.columns if column not in METADATA_COLUMNS
    ]

    # ------------------------------------------------------------------
    # 5) per-channel target/decoy classification and FDR estimation
    # ------------------------------------------------------------------
    output_dfs = []

    for channel in [4,8]:
        # Rows of the current channel are labelled as targets (0), rows of
        # channel 12 as decoys (1); this replaces the merged `decoy` column.
        # `assign` + `copy` yield an independent frame, so the later column
        # assignments do not write into a slice of `features_df`.
        channel_df = (
            features_df[features_df['channel'].isin([channel, 12])]
            .assign(decoy=lambda frame: (frame['channel'] == 12).astype(np.float64))
            .dropna()
            .copy()
        )

        # the second step keeps its historical name 'GBC' although the
        # estimator is an MLPClassifier
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('GBC',MLPClassifier(
                hidden_layer_sizes=(50, 25, 5),
                max_iter=1000,
                alpha=0.1,
                learning_rate='adaptive',
                learning_rate_init=0.001,
                early_stopping=True,
                tol=1e-6,
                random_state=RANDOM_STATE,
            ))
        ])

        X = channel_df[all_feature_columns].values
        y = channel_df['decoy'].values

        print(X.shape)
        print(y.shape)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE
        )
        pipeline.fit(X_train, y_train)

        # column 1 of predict_proba is the probability of the decoy class
        y_test_proba = pipeline.predict_proba(X_test)[:,1]
        y_test_pred = np.round(y_test_proba)

        y_train_proba = pipeline.predict_proba(X_train)[:,1]
        y_train_pred = np.round(y_train_proba)

        channel_df['proba'] = pipeline.predict_proba(X)[:,1]
        # Rank all candidates from most target-like to most decoy-like.
        # No per-precursor subsetting happens here; every row is kept.
        channel_df = channel_df.sort_values(by=['proba'], ascending=True)
        features_best_df = channel_df

        # ROC curve
        fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
        roc_auc_test = auc(fpr_test, tpr_test)

        fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
        roc_auc_train = auc(fpr_train, tpr_train)

        # plotting
        fig, axs = plt.subplots(ncols=3, figsize=(12,3.5))

        axs[0].plot(fpr_test, tpr_test,label="ROC test (area = %0.2f)" % roc_auc_test)
        axs[0].plot(fpr_train, tpr_train,label="ROC train (area = %0.2f)" % roc_auc_train)

        axs[0].plot([0, 1], [0, 1], color="k", linestyle="--")
        axs[0].set_xlim([0.0, 1.0])
        axs[0].set_ylim([0.0, 1.05])
        axs[0].set_xlabel("false positive rate")
        axs[0].set_ylabel("true positive rate")
        axs[0].set_title("ROC Curve")
        axs[0].legend(loc="lower right")

        sns.histplot(data=features_best_df, x='proba', hue='decoy', bins=30, element="step", fill=False, ax=axs[1])
        axs[1].set_xlabel('score')
        axs[1].set_ylabel('number of precursors')
        axs[1].set_title("Score Distribution")

        # sort_values returns a new frame, so `qval` is added to
        # `features_best_df` only and not to `channel_df`
        features_best_df = features_best_df.sort_values(['proba'], ascending=True)
        target_values = 1-features_best_df['decoy'].values
        decoy_cumsum = np.cumsum(features_best_df['decoy'].values)
        target_cumsum = np.cumsum(target_values)
        # the denominator is clamped to 1 so that leading decoy rows (no
        # target seen yet) do not cause a division by zero
        fdr_values = decoy_cumsum/np.maximum(target_cumsum, 1)
        features_best_df['qval'] = fdr_to_q_values(fdr_values)
        q_val = features_best_df[features_best_df['qval'] <0.05 ]['qval'].values

        ids = np.arange(0, len(q_val), 1)
        axs[2].plot(q_val, ids)
        axs[2].set_xlim(-0.001, 0.05)
        axs[2].set_xlabel('q-value')
        axs[2].set_ylabel('number of precursors')
        axs[2].set_title("Identifications")

        fig.tight_layout()
        plt.show()
        # release the figure; one is created per channel and per raw file
        plt.close(fig)

        # number of rows and of distinct protein entries at 1% q-value
        print(len(features_best_df[features_best_df['qval'] <=0.01 ]['qval']))
        print(features_best_df[features_best_df['qval'] <=0.01 ]['proteins'].nunique())

        output_dfs.append(features_best_df[features_best_df['qval'] <=0.01])

    # drop the reference to the raw data of this run before the next iteration
    del dia_data

    stop_time = time.time()
    duration = stop_time - start_time

    duration_df = pd.DataFrame({'raw_name': [raw_name], 'duration': [duration]})
    duration_df.to_csv(os.path.join(output_location, f'{raw_name}_duration.tsv'), sep = '\t', index = False)

    # combined 1% q-value results of channels 4 and 8 for this raw file
    output_dfs = pd.concat(output_dfs)
    output_dfs.to_csv(os.path.join(output_location, f'{raw_name}_d4_d8.tsv'), sep = '\t', index = False)
    

0:00:04.783458 [32;20m PROGRESS:       _   _      _         ___ ___   _   [0m
0:00:04.783974 [32;20m PROGRESS:      /_\ | |_ __| |_  __ _|   \_ _| /_\  [0m
0:00:04.784259 [32;20m PROGRESS:     / _ \| | '_ \ ' \/ _` | |) | | / _ \ [0m
0:00:04.784517 [32;20m PROGRESS:    /_/ \_\_| .__/_||_\__,_|___/___/_/ \_\[0m
0:00:04.784841 [32;20m PROGRESS:            |_|                            [0m
0:00:04.785075 [32;20m PROGRESS: [0m
0:00:04.785445 [38;20m INFO: loading default config from /Users/georgwallmann/Documents/git/alphadia/alphadia/extraction/../../misc/config/default.yaml[0m
0:00:04.791389 [38;20m INFO: Applying config update from dict[0m
0:00:04.791854 [32;20m PROGRESS: version: 1.0.2[0m
0:00:04.792121 [32;20m PROGRESS: hostname: Georgs-MacBook-Pro.local[0m
0:00:04.792444 [32;20m PROGRESS: date: 2023-05-25 00:13:04[0m
0:00:08.400297 [38;20m INFO: renaming precursor_columns columns[0m
0:00:08.401317 [38;20m INFO: renaming fragment_columns columns[0m
0:00:08.4