In [1]:
%reload_ext autoreload
%autoreload 2

from alpharaw.mzml import MzMLReader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from alphabase.spectral_library.base import SpecLibBase

In [2]:
# Input locations: everything lives below the validation data folder in the
# user's home directory.
root = Path("~//data/alphadia-validate/").expanduser()
path_search = root.joinpath('output/precursors.tsv')  # precursor table written by the search
path_lib = root.joinpath('output/speclib.hdf')  # spectral library (HDF)
path_spectra = root.joinpath('20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21min_F-40_07.raw')  # Thermo raw file

#### Obtain raw & results data
Note: this is only required if you did not run the search yourself in `search_1.10.0.ipynb`.

In [3]:
from alphadia.data.alpharaw_wrapper import Thermo

# Search results: tab-separated precursor table.
precursor_df = pd.read_csv(path_search, sep='\t')
# Spectral library: create an empty SpecLibBase and fill it from the HDF file.
spectral_library = SpecLibBase()
spectral_library.load_hdf(path_lib)
# Raw DIA data read from the Thermo .raw file (this is the slow step; see the
# progress bar in the cell output).
dia_data =  Thermo(path_spectra)

49it [00:17,  2.85it/s]


In [4]:
from alphabase.spectral_library.flat import SpecLibFlat
# Build a flat library from the base library loaded above; the flat form
# exposes a single `fragment_df` table (displayed in the next cell).
spectral_library_flat = SpecLibFlat()
spectral_library_flat.parse_base_library(spectral_library)

In [5]:
# Display the fragment table of the flat spectral library.
spectral_library_flat.fragment_df

Unnamed: 0,mz,intensity,type,loss_type,charge,number,position
0,801.486877,0.125327,121,0,1,6,0
1,261.126740,0.335304,98,0,1,2,1
2,688.402832,1.000000,121,0,1,5,1
3,344.705048,0.049048,121,0,2,5,1
4,374.210815,0.282448,98,0,1,3,2
...,...,...,...,...,...,...,...
5465187,630.320557,0.124205,121,0,1,6,44
5465188,573.299072,0.029776,121,0,1,5,45
5465189,444.256500,0.078959,121,0,1,4,46
5465190,343.208832,0.125994,121,0,1,3,47


| Column | Description |
|--------|-------------|
| mz | Mass to charge ratio of the fragment ion |
| intensity | Relative intensity of the fragment ion |
| type | Type of fragment series. Use `chr(type)` to get type:<br>`a = 97, b = 98, c = 99, x = 120, y = 121, z = 122` |
| loss_type | Fragment loss type:<br>`0 = none, 18 = H2O, 17 = NH3` |
| charge | Charge of the fragment ion |
| number | Fragment series number in direction of series (one indexed).<br>Example: `type=121` and `number=1` would be `y1` |
| position | Position of fragment ion from N-terminus (zero indexed).<br>Example: `type=98` and `position=2` would be `b3` |

#### 2.2 Precursor data

The identified precursors following search are stored in the `precursor_df` DataFrame.
The precursors in this dataframe come from the spectral library but have additional information on their identification.

The most important columns are:
- `mod_seq_charge_hash`: the hash of the precursor sequence and charge

The scans in which they were identified are called frames.
- `frame_start`: the frame number of the first frame in the run.
- `frame_stop`: the frame number of the last frame in the run

Furthermore there is the q-value and a multitude of scores that were used to identify the precursor.

In [6]:
# Preview the first rows of the search results.
precursor_df.head()

Unnamed: 0,base_width_mobility,base_width_rt,rt_observed,mobility_observed,mono_ms1_intensity,top_ms1_intensity,sum_ms1_intensity,weighted_ms1_intensity,weighted_mass_deviation,weighted_mass_error,...,_candidate_idx,valid,candidate_idx,run,mod_seq_hash,mod_seq_charge_hash,pg_master,pg,pg_qval,intensity
0,0.0,10.858948,215.645462,1e-06,5671702.0,5671702.0,13976840.0,4629540.0,1.317407,1.317407,...,387962,True,387962,20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21m...,6676861052106421843,6676861052106421845,Q04323,Q04323,0.0,6117868.5
1,0.0,10.866623,139.461288,1e-06,701773.4,701773.4,1141551.0,456954.4,-0.344464,0.344464,...,321953,True,321953,20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21m...,1446457374900626415,1446457374900626417,Q9H2G2,Q9H2G2,0.0,31296790.0
2,0.0,10.876144,242.052673,1e-06,2583778.0,2583778.0,5314894.0,1884685.0,0.569951,0.569951,...,376954,True,376954,20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21m...,12973433993431495764,12973433993431495766,Q29RF7,Q29RF7,0.0,12477771.0
3,0.0,10.86853,207.885986,1e-06,1231147.0,1231147.0,2963482.0,969063.9,1.002455,1.002455,...,430315,True,430315,20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21m...,8447969178672477057,8447969178672477059,P05198,P05198,0.0,81458184.0
4,0.0,10.909012,254.506622,1e-06,1012001.0,1012001.0,2033442.0,709540.1,0.35357,0.35357,...,359561,True,359561,20231017_OA2_TiHe_ADIAMA_HeLa_200ng_Evo011_21m...,3047295262910009981,3047295262910009983,Q8N954,Q8N954,0.0,3113382.0


#### 2.3 DIA data

Last, we have the raw DIA data object loaded from the Thermo raw file.

This object contains all scans in the `dia_data.spectrum_df` DataFrame.
Each spectrum points to a collection of peaks based on the `peak_start_idx` and `peak_stop_idx` columns.

These point to the `dia_data.peak_df` DataFrame, which contains the peak information.

In [7]:
# Preview the per-spectrum table (one row per scan).
dia_data.spectrum_df.head()

Unnamed: 0,spec_idx,peak_start_idx,peak_stop_idx,rt,precursor_mz,precursor_charge,isolation_lower_mz,isolation_upper_mz,ms_level,nce
0,0,0,35,0.0,-1.0,0,-1.0,-1.0,1,0.0
1,1,35,42,0.000439,385.42508,0,384.424625,386.425535,2,25.0
2,2,42,43,0.000521,389.42691,0,388.426455,390.427365,2,25.0
3,3,43,44,0.000601,395.42963,0,394.429175,396.430085,2,25.0
4,4,44,45,0.000694,397.43054,0,396.430085,398.430995,2,25.0


In [8]:
# Preview the peak table that `peak_start_idx` / `peak_stop_idx` index into.
dia_data.peak_df.head()

Unnamed: 0,mz,intensity
0,459.638519,13105.375977
1,470.064972,10583.207031
2,472.338043,10462.697266
3,479.018433,10998.292969
4,507.042847,13371.03418


#### 3 Map precursor hit from search results to raw data

Using this information we can map the identified precursors to the raw data.

We will use the `get_flat_library_entry_by_hash` function (the flat-library counterpart of `get_library_entry_by_hash`) to get the library entry for a given hash.

This function returns the library entry, the fragment m/z values and the fragment intensities.



In [9]:
# Pick one identified precursor (row 3000 of the search results) as the example.
# NOTE(review): the name `hash` shadows Python's builtin `hash()` for the rest
# of the notebook; it is reused further down in this cell.
hash = precursor_df['mod_seq_charge_hash'].iloc[3000]

def get_library_entry_by_hash(speclib, hash, min_intensity=0.01):
    """Return the library precursor matching ``hash`` together with its fragments.

    Parameters
    ----------
    speclib
        Spectral library exposing ``precursor_df``, ``fragment_mz_df`` and
        ``fragment_intensity_df``.
    hash
        Value looked up in the ``mod_seq_charge_hash`` column of
        ``speclib.precursor_df``. The first matching row is used.
    min_intensity : float, default 0.01
        Fragments whose library intensity is not strictly greater than this
        value are dropped.

    Returns
    -------
    speclib_entry : pandas.Series
        The matching row of ``speclib.precursor_df``.
    fragment_mz : numpy.ndarray
        m/z values of the retained fragments, sorted ascending.
    fragment_intensity : numpy.ndarray
        Library intensities in the same order as ``fragment_mz``.

    Raises
    ------
    IndexError
        If no precursor with the given hash exists (raised by ``.iloc[0]``).
    """
    speclib_entry = speclib.precursor_df[speclib.precursor_df['mod_seq_charge_hash'] == hash].iloc[0]

    # The fragment tables are 2-D; the precursor's rows are flattened into one
    # 1-D array per table so m/z and intensity stay element-wise aligned.
    fragment_mz = speclib.fragment_mz_df.iloc[speclib_entry.frag_start_idx:speclib_entry.frag_stop_idx].to_numpy().flatten()
    fragment_intensity = speclib.fragment_intensity_df.iloc[speclib_entry.frag_start_idx:speclib_entry.frag_stop_idx].to_numpy().flatten()
    fragment_mask = fragment_intensity > min_intensity

    fragment_mz = fragment_mz[fragment_mask]
    fragment_intensity = fragment_intensity[fragment_mask]

    # sort both by mz
    fragment_order = np.argsort(fragment_mz)
    fragment_mz = fragment_mz[fragment_order]
    fragment_intensity = fragment_intensity[fragment_order]

    # The original fell off the end here and implicitly returned None.
    return speclib_entry, fragment_mz, fragment_intensity

def get_flat_library_entry_by_hash(speclib_flat, hash, min_intensity=0.01):
    """Look up a precursor in a flat spectral library and collect its fragments.

    The first row of ``speclib_flat.precursor_df`` whose
    ``mod_seq_charge_hash`` equals ``hash`` is selected. Its fragments are the
    rows ``flat_frag_start_idx:flat_frag_stop_idx`` of
    ``speclib_flat.fragment_df``; only those with ``intensity`` strictly above
    ``min_intensity`` are kept.

    Returns
    -------
    tuple
        ``(speclib_entry, fragment_mz, fragment_intensity)`` where the two
        arrays are aligned and ordered by ascending m/z.
    """
    precursors = speclib_flat.precursor_df
    speclib_entry = precursors[precursors['mod_seq_charge_hash'] == hash].iloc[0]

    # Rows of the flat fragment table that belong to this precursor.
    fragment_rows = slice(speclib_entry.flat_frag_start_idx, speclib_entry.flat_frag_stop_idx)
    fragments = speclib_flat.fragment_df
    all_mz = fragments['mz'].iloc[fragment_rows].to_numpy().flatten()
    all_intensity = fragments['intensity'].iloc[fragment_rows].to_numpy().flatten()

    # Drop low-intensity fragments, then order what is left by m/z.
    keep = all_intensity > min_intensity
    kept_mz = all_mz[keep]
    kept_intensity = all_intensity[keep]
    by_mz = np.argsort(kept_mz)

    return speclib_entry, kept_mz[by_mz], kept_intensity[by_mz]

# Search-result row for the selected precursor (first match on the hash).
precursor_entry = precursor_df.loc[precursor_df['mod_seq_charge_hash'] == hash].iloc[0]
# Matching library entry plus its m/z-sorted fragment m/z values and intensities.
speclib_entry, mz_library, intensity_library = get_flat_library_entry_by_hash(spectral_library_flat, hash)

In [10]:
# Representation of the raw data used for the dense extraction below
# (presumably a numba jitclass, going by the method name; confirm).
jit_data = dia_data.jitclass()

# One query row; both entries are the library precursor m/z.
# NOTE(review): presumably read as a [lower, upper] m/z window; confirm.
precursor_query = np.array([[speclib_entry.precursor_mz, speclib_entry.precursor_mz]], dtype=np.float32)
# Scan and frame ranges taken from the search result for this precursor.
# NOTE(review): the trailing 1 looks like a step in [start, stop, step]; confirm.
scan_limits = np.array([[precursor_entry.scan_start, precursor_entry.scan_stop, 1]], dtype=np.int64)
frame_limits = np.array([[precursor_entry.frame_start, precursor_entry.frame_stop, 1]], dtype=np.int64)

# Extract the dense slice for the library fragment m/z values.
# Arguments are passed positionally; keep their order.
dense, precursor_index = jit_data.get_dense(
    frame_limits,
    scan_limits,
    mz_library,
    30,  # NOTE(review): presumably the mass tolerance (ppm?); confirm against get_dense
    precursor_query,
)

#### 4 Visualize precursor data

Now, we want to visualize the retrieved spectrum data.
We will start by visualizing the observed spectrum and the library spectrum.

The spectrum data `dense` is a 5 dimensional numpy array with a dense slice of the spectrum space.
The dimensions are:
- 0: either intensity information 0 or relative mass error 1
- 1: index of the fragment mz which was queried
- 2: ion mobility dimension (will be zero for DIA data)
- 3: The observations in the DIA cycle. As there might be multiple quadrupole windows where the precursor was detected, this will be a list of observations.
- 4: Retention time datapoints.

First we will select the intensity dimension and sum over all other dimensions but the fragment mz dimension.


Finally, we will visualize the precursor ion chromatogram.
We will again select the intensity dimension and sum over ion mobility and observation but leave the retention time dimension.


In [14]:
import altair as alt

In [39]:
def get_xic_df(dense):
    """Turn a dense extraction into a long-format chromatogram table.

    Takes index 0 along the first axis of ``dense``, sums the result over its
    axes 1 and 2, and reshapes the remaining 2-D array (rows x columns) into
    long format.

    Returns
    -------
    pandas.DataFrame
        Columns ``Series`` (row index of the summed array), ``Scan`` (column
        index of the summed array) and ``Intensity`` (the summed value).
    """
    summed = dense[0].sum(axis=(1, 2))

    return (
        pd.DataFrame(summed)
        .reset_index()
        .melt(id_vars='index', var_name='Scan', value_name='Intensity')
        .rename(columns={'index': 'Series'})
    )

# Long-format XIC table produced by get_xic_df (columns: Series, Scan, Intensity).
df_xic = get_xic_df(dense)

# One line per series. The intensities are plotted as-is: no log or sqrt
# transform is applied to the data or to the y scale, so the axis is titled
# plain "Intensity" (it was previously mislabelled "log Intensity").
chart = alt.Chart(df_xic).mark_line().encode(
    x=alt.X('Scan:Q', title='Scan Number'),
    y=alt.Y('Intensity:Q', title='Intensity'),
    color='Series:N',
    detail='Series:N',
).properties(
    width=600,
    height=400,
)
chart