# AlphaViz tutorial

This tutorial covers the basics of using AlphaViz as a Python package. It includes the following sections:
1. [**Setup**](#Setup)
2. [**Data upload**](#Data-upload)
3. [**Analysis**](#Analysis)  
    a) [**Protein level**](#Protein-level)  
    b) [**Peptide level**](#Peptide-level)
4. [**Prediction of theoretical mass spectrum**](#Prediction-of-theoretical-mass-spectrum)
5. [**Quality control of the entire sample**](#Quality-control-of-the-entire-sample)

# Setup

### Import all necessary libraries

In [None]:
import os
import logging
import pandas as pd
from io import StringIO

import alphatims.bruker
import alphatims.utils

# visualization
import panel as pn
import bokeh.server.views.ws
from bokeh.models.widgets.tables import NumberFormatter
import holoviews as hv

# local
import alphaviz
import alphaviz.utils
import alphaviz.io
import alphaviz.preprocessing
import alphaviz.plotting
import alphaviz.gui

# do not limit the number of columns shown when a DataFrame is displayed
pd.set_option('display.max_columns', None)

To save the Plotly plots as .svg, you need to add this code with the name of the file inside the show function:
`.show(config=alphaviz.gui.update_config(filename='elution_profile.svg'))` and when the plot is shown, click the "Download plot" button on the Plotly tool panel.  

You can also specify the height and width of the saved plot. See example for the Chromatogram plot.

### Set paths to raw data, software analysis output folder (MaxQuant), fasta file

In [None]:
# NOTE: the three paths below are examples from the author's machine; replace them
# with the locations of your own files before running the notebook.
# path to the Bruker .d folder or the .hdf file
experimental_file = '/Users/eugeniavoytik/Projects/DIANN_peptides_per_protein/1.ddaPASEF_from_onlinePASEF_paper/20190504_TIMS1_FlMe_SA_HeLa_frac01_A10_1_93.hdf'
# path to the MaxQuant (MQ) output folder
mq_output_folder = '/Users/eugeniavoytik/Projects/DIANN_peptides_per_protein/1.ddaPASEF_from_onlinePASEF_paper/txt'
# path to the fasta file
fasta_file = '/Users/eugeniavoytik/copied/Bruker/MaxQuant_output_tables/20210413_TIMS03_EVO03_PaSk_MA_HeLa_200ng_S1-A1_1_24848.d/txt/human.fasta'

# Data upload

### 1) Load the raw file

In [None]:
# Bruker: open the raw data stored at `experimental_file` as an AlphaTims TimsTOF object
raw_data = alphatims.bruker.TimsTOF(experimental_file)

### 2) Load the files from the MaxQuant output folder needed for visualization

In [None]:
# Name of the experimental file without its extension, passed as the last argument.
# normpath drops a trailing path separator (common when a .d folder is given), and
# splitext removes only the final extension, so dots inside the file name are kept.
raw_file_name = os.path.splitext(
    os.path.basename(os.path.normpath(experimental_file))
)[0]

# load the five MaxQuant tables that are used by the plots in this tutorial
all_peptides, msms, evidence, protein_groups, summary = alphaviz.io.import_mq_output(
    ['allPeptides.txt', 'msms.txt', 'evidence.txt', 'proteinGroups.txt', 'summary.txt'],
    mq_output_folder,
    raw_file_name,
)

### 3) Load the fasta file

In [None]:
# read the fasta file; `fasta` is later used to look up the sequence of the selected protein
fasta = alphaviz.io.read_fasta(fasta_file)

# Analysis

To start the analysis, show the "Chromatograms" plot that visualises the total ion chromatograms and the base peak chromatograms for MS1 and MS2 data.

In [None]:
# build the chromatograms figure from the raw data
chromatograms_plot = alphaviz.plotting.plot_chrom(
    raw_data,
    colorscale_qualitative="Pastel",
)
# display it with an export configuration (file name and size of the saved plot)
chromatograms_export_config = alphaviz.gui.update_config(
    filename='chromatograms.svg',
    height=400,
    width=600,
)
chromatograms_plot.show(config=chromatograms_export_config)

## Protein level

To assess the quality of each protein individually, provide the gene name of the protein.

In [None]:
# gene name of the protein to inspect; it is compared with the 'Gene names' column
# of the proteinGroups and evidence tables in the following cells
gene_name = 'IFRD1'

In [None]:
# take the 'Protein IDs' entry of the first protein group whose 'Gene names' value
# equals the selected gene and fetch the matching sequence from the fasta file
is_selected_gene = protein_groups['Gene names'] == gene_name
selected_protein_ids = protein_groups.loc[is_selected_gene, 'Protein IDs'].values[0]
protein_seq = alphaviz.preprocessing.get_aa_seq(selected_protein_ids, fasta)

Here you can see a filtered data frame containing information about **all peptides** identified for the selected protein, sorted by the Andromeda score.

In [None]:
# evidence rows of the selected gene, best Andromeda score first
peptides_table = (
    evidence
    .loc[evidence['Gene names'] == gene_name]
    .sort_values('Andromeda score', ascending=False)
)
peptides_table

To explore the position of the identified peptides on the protein sequence, plot the protein coverage for all peptides simultaneously or just for a selected peptide (see below).

In [None]:
# every modified sequence identified for the selected protein
all_modified_sequences = peptides_table['Modified sequence'].tolist()

# sequence coverage of the protein by all of these peptides;
# the regex is handed to the plotting function unchanged
protein_coverage_plot = alphaviz.plotting.plot_sequence_coverage(
    protein_seq,
    gene_name,
    all_modified_sequences,
    colorscale_qualitative="Plotly",
    colorscale_sequential="Viridis",
    regex=r"\[([^]]+)\]|\((\w+)\)",
)
protein_coverage_plot

## Peptide level

From this point onwards, we are going to assess the individual quality of each peptide.

In [None]:
# specify the index of the peptide from the peptides table that you'd like to explore further
# (this is the index label of the row, since it is used with `peptides_table.loc[...]`)
selected_peptide_index = 187773

In [None]:
# modified sequence of the selected peptide, wrapped in a list because the plotting
# function receives a list of sequences
selected_modified_sequence = peptides_table.loc[selected_peptide_index, 'Modified sequence']

# sequence coverage of the protein by the selected peptide only
protein_coverage_plot_one_peptide = alphaviz.plotting.plot_sequence_coverage(
    protein_seq,
    gene_name,
    [selected_modified_sequence],
    colorscale_qualitative="Alphabet",
    colorscale_sequential="Viridis",
    regex=r"\[([^]]+)\]|\((\w+)\)",
)
protein_coverage_plot_one_peptide

In [None]:
# some preprocessing steps

# MS/MS scan number of the selected peptide, kept as a one-element list for `isin`
scan_number = [int(peptides_table.loc[selected_peptide_index, 'MS/MS scan number'])]

# 'Pasef MS/MS IDs' of the first allPeptides row that has this scan number
matching_all_peptides = all_peptides[all_peptides['MS/MS scan number'].isin(scan_number)]
pasef_ids = list(map(int, matching_all_peptides['Pasef MS/MS IDs'].values[0]))

# fragment frames with these IDs, joined with the precursor records they point to
precursors = raw_data.fragment_frames[raw_data.fragment_frames.index.isin(pasef_ids)]
matching_precursors = raw_data.precursors[
    raw_data.precursors.Id.isin(precursors.Precursor.values)
]
merged_precursor_data = precursors.merge(
    matching_precursors,
    left_on='Precursor',
    right_on='Id',
)
merged_precursor_data['Frame_Prec'] = list(
    zip(merged_precursor_data.Frame, merged_precursor_data.Precursor)
)

# parent (MS1) frame -> (MS2 frame, precursor ID)
ms1_ms2_frames = dict(
    zip(merged_precursor_data.Parent, merged_precursor_data.Frame_Prec)
)
# the first MS1 frame of the mapping is the one used by the following plots
current_frame = list(ms1_ms2_frames)[0]

In [None]:
# display the mapping: MS1 frames as keys and (MS2 frame, precursor ID) tuples as values
ms1_ms2_frames

Specify the tolerance settings:

In [None]:
# m/z tolerance applied around the precursor's monoisotopic m/z
xic_tol_value = 15 # in ppm
# ion mobility tolerance applied around the peptide's 1/K0 value
xic_im_tol = 0.05 # in 1/K0
# retention time tolerance applied around the peptide's retention time
rt_tolerance = 30 # sec

Slice the raw data around the precursor using the m/z, ion mobility (IM) and retention time tolerances, and visualize the extracted ion chromatogram (XIC), mobilogram and MS1 spectrum for the selected peptide.

In [None]:
# m/z window: median monoisotopic m/z of the precursors, divided/multiplied by the
# ppm tolerance factor
prec_mono_mz = merged_precursor_data.MonoisotopicMz.median()
prec_mono_low_mz = prec_mono_mz / (1 + xic_tol_value / 10**6)
prec_mono_high_mz = prec_mono_mz * (1 + xic_tol_value / 10**6)
mz_window = slice(prec_mono_low_mz, prec_mono_high_mz)

# retention time window in seconds ('Retention time' is multiplied by 60 first)
rt_center_sec = float(peptides_table.loc[selected_peptide_index, 'Retention time'])*60
rt_low_sec = rt_center_sec - rt_tolerance
rt_upp_sec = rt_center_sec + rt_tolerance

everything = slice(None)

for x_axis_label in ('rt', 'mobility', 'mz'):
    if x_axis_label == 'mz':
        # MS1 spectrum: all raw indices of the selected MS1 frame
        selection = (current_frame, 'raw')
    elif x_axis_label == 'mobility':
        # mobilogram: restricted to the retention time and m/z windows
        selection = (
            slice(rt_low_sec, rt_upp_sec),
            everything,
            everything,
            mz_window,
            'raw',
        )
    else:
        # XIC: restricted to the ion mobility and m/z windows
        one_over_k0 = float(peptides_table.loc[selected_peptide_index, '1/K0'])
        one_over_k0_low = one_over_k0 - xic_im_tol
        one_over_k0_high = one_over_k0 + xic_im_tol
        selection = (
            everything,
            slice(one_over_k0_low, one_over_k0_high),
            everything,
            mz_window,
            'raw',
        )
    precursor_indices = raw_data[selection]
    line_plot = alphaviz.plotting.plot_line(
        raw_data,
        precursor_indices,
        x_axis_label,
        colorscale_qualitative="Pastel",
    )
    line_plot.show()

Visualize the MS1 or MS2 frame for the selected peptide with the location where the precursor has been selected for analysis.

In [None]:
# activate the bokeh backend of HoloViews before creating the heatmaps
hv.extension('bokeh')

In [None]:
# m/z and 1/K0 of the selected peptide, passed together with the precursor marker
# settings (size and color)
selected_mz = float(peptides_table.loc[selected_peptide_index, 'm/z'])
selected_im = float(peptides_table.loc[selected_peptide_index, '1/K0'])

# heatmap of the selected MS1 frame
ms1 = alphaviz.plotting.plot_heatmap(
    raw_data[current_frame],
    mz=selected_mz,
    im=selected_im,
    x_axis_label='m/z, Th',
    y_axis_label='Inversed IM, V·s·cm\u207B\u00B2',
    title=f'MS1 frame(s) #{current_frame}',
    colormap='fire',
    background_color='black',
    precursor_size=15,
    precursor_color='blue',
    width=600,
)
ms1

In [None]:
# MS2 frame that belongs to the selected MS1 frame (first element of the mapped tuple)
selected_ms2_frame = ms1_ms2_frames[current_frame][0]

# heatmap of that MS2 frame
ms2 = alphaviz.plotting.plot_heatmap(
    raw_data[selected_ms2_frame],
    x_axis_label='m/z, Th',
    y_axis_label='Inversed IM, V·s·cm\u207B\u00B2',
    title=f'MS2 frame(s) #{selected_ms2_frame}',
    colormap='fire',
    background_color='black',
    width=600,
)
ms2

Each of the heatmaps can be saved as .svg:

In [None]:
# save the MS2 heatmap to 'plot_ms2_heatmap.svg' with a height and width of 600
alphaviz.plotting.export_svg(ms2, 'plot_ms2_heatmap.svg', height=600, width=600)

For each of the MS2 frames where the peptide was identified, visualize the MS2 spectrum with a mass error plot for each ion and annotated peptide sequence as subplots.

In [None]:
# one annotated MS2 spectrum per MS1 frame of the mapping
for each in ms1_ms2_frames:
    # second element of the mapped (MS2 frame, precursor ID) tuple
    precursor_id = ms1_ms2_frames[each][1]

    data_ions = alphaviz.preprocessing.get_mq_ms2_scan_data(
        msms,
        scan_number[0],
        raw_data,
        precursor_id,
    )

    # The title names the precursor of this iteration. It was previously always read
    # from `current_frame`, so every plot was labelled with the same precursor ID.
    alphaviz.plotting.plot_complex_ms_plot(
        data_ions,
        title=f'MS2 spectrum for Precursor: {precursor_id}',
        sequence=peptides_table.loc[selected_peptide_index, 'Sequence'],
    ).show()


# Prediction of theoretical mass spectrum

In [None]:
# peptdeep supplies the ModelManager used for the prediction below
import peptdeep
from peptdeep.pretrained_models import ModelManager

# create the model manager and load the installed models
model_mgr = ModelManager()
model_mgr.load_installed_models()

Load the evidence file in the form necessary for the prediction:

In [None]:
from alphabase.io.psm_reader import psm_reader_provider

# read the MaxQuant evidence table with the alphabase PSM reader; the path is built with
# os.path.join instead of concatenating a hard-coded '/' separator
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.load(os.path.join(mq_output_folder, 'evidence.txt'))

# one row per combination of the grouping columns, keeping the median 'ccs' value
precursor_columns = ['sequence', 'mods', 'mod_sites', 'nAA', 'charge', 'spec_idx']
psm_df = mq_reader.psm_df.groupby(precursor_columns)['ccs'].median().reset_index()

# constant columns added for the prediction
psm_df['nce'] = 0.3
psm_df['instrument'] = 'timsTOF'
psm_df.head()

To run the prediction for one peptide, slice the data based on the spec_idx and its sequence.

In [None]:
# sequence and MS/MS scan number of the selected peptide
selected_sequence = peptides_table.loc[selected_peptide_index, 'Sequence']
selected_scan = peptides_table.loc[selected_peptide_index, 'MS/MS scan number']

# keep the rows that match both the sequence and the spectrum index
same_sequence = psm_df.sequence == selected_sequence
same_spectrum = psm_df.spec_idx == selected_scan
data_slice = psm_df.loc[same_sequence & same_spectrum]
data_slice

Run the intensity prediction:

In [None]:
# run only the 'ms2' prediction for the selected rows, without multiprocessing;
# frag_types requests the 'b_z1' and 'y_z1' fragment columns that are read below
predlib = model_mgr.predict_all(data_slice, predict_items=['ms2'], frag_types=['b_z1', 'y_z1'], multiprocessing=False)

In [None]:
mz_ions = predlib['fragment_mz_df']

# Scale into a new frame instead of using `*=`: the in-place form modified the frame
# stored in `predlib`, so re-running this cell multiplied the intensities by -100 again.
# NOTE(review): the negative sign presumably mirrors the predicted peaks below the
# measured spectrum; confirm against plot_complex_ms_plot.
intensities_ions = predlib['fragment_intensity_df'] * -100

# b-type values in row order, followed by y-type values in reversed row order, so that
# both halves line up with the labels b1..bN and y1..yN.
# NOTE(review): assumes `predlib` holds the fragments of a single precursor.
predicted_df = pd.DataFrame({
    'FragmentMz': (
        mz_ions.b_z1.values.tolist() + mz_ions.y_z1.values.tolist()[::-1]
    ),
    'RelativeIntensity': (
        intensities_ions.b_z1.values.tolist()
        + intensities_ions.y_z1.values.tolist()[::-1]
    ),
    'ions': (
        [f"b{i}" for i in range(1, len(mz_ions.b_z1)+1)]
        + [f"y{i}" for i in range(1, len(mz_ions.y_z1)+1)]
    ),
})

predicted_df.head()

In [None]:
# for the first MS2 frame: use the first MS1 frame of the mapping (`current_frame`)
# instead of a hard-coded frame number, which is only a valid key for one specific
# peptide of one specific data set and raises a KeyError otherwise
ms1_frame = current_frame
# second element of the mapped (MS2 frame, precursor ID) tuple
precursor_id = ms1_ms2_frames[ms1_frame][1]

data_ions = alphaviz.preprocessing.get_mq_ms2_scan_data(
    msms,
    scan_number[0],
    raw_data,
    precursor_id,
)

# measured spectrum together with the predicted m/z values, intensities and ion labels
alphaviz.plotting.plot_complex_ms_plot(
    data_ions,
    title=f'MS2 spectrum for Precursor: {precursor_id}',
    sequence=peptides_table.loc[selected_peptide_index, 'Sequence'],
    predicted=(predicted_df.FragmentMz, predicted_df.RelativeIntensity, predicted_df.ions)
)

# Quality control of the entire sample

Here you can find several quality control plots for the entire sample. 

In [None]:
# mass error plot built from the evidence table
# NOTE(review): 'm/z' and 'Uncalibrated mass error [ppm]' are presumably evidence column
# names and the last string the plot title; confirm against plot_mass_error
uncalb_mass_dens_plot = alphaviz.plotting.plot_mass_error(
    evidence,
    'm/z',
    'Uncalibrated mass error [ppm]',
    'Uncalibrated mass density plot'
)
uncalb_mass_dens_plot

In [None]:
# bar plot built from the proteinGroups table
# NOTE(review): '(EXP) # peptides' is presumably a column name (or pattern) and
# 'Peptides per protein' the plot title; confirm against plot_pept_per_protein_barplot
peptide_per_protein_distr = alphaviz.plotting.plot_pept_per_protein_barplot(
    protein_groups,
    '(EXP) # peptides',
    'Peptides per protein',
)
peptide_per_protein_distr

In [None]:
# distribution plot built from the evidence table for 'm/z'
# NOTE(review): the second argument is presumably the evidence column to plot and the
# third the plot title; confirm against plot_peptide_distr
peptide_mz_distr = alphaviz.plotting.plot_peptide_distr(
    evidence,
    'm/z',
    'Peptide m/z distribution'
)
peptide_mz_distr

In [None]:
# Distribution of the peptide lengths in the evidence table. The column was 'K0 length',
# which does not match the "Peptide length distribution" title of this plot.
# NOTE(review): 'Length' is presumably the evidence column holding the peptide length;
# confirm against your MaxQuant evidence.txt.
peptide_length_distr = alphaviz.plotting.plot_peptide_distr(
    evidence,
    'Length',
    'Peptide length distribution'
)
peptide_length_distr