# Basic `alphatools` workflow

1. load data and meta data
2. basic minimal preprocessing
3. perform PCA
4. plot PCA embeddings, variance and loadings

In [None]:
%load_ext autoreload
%autoreload 2

import logging

import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

import alphatools as at
from alphatools.pl.figure import create_figure, label_axes
from alphatools.pl.plots import Plots
from alphatools.pp.embeddings import pca
from alphatools.pp.metadata import add_core_proteome_mask


# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

## Load the data 

Dataset taken from datashare folder: https://datashare.biochem.mpg.de/s/sSYkOj22kM5AJ4O

In [None]:
# TODO: change after merging, and use Lucas' code to load the DIA-NN data.

# Remote locations of the protein-group matrix and of the sample metadata.
url_data = "https://datashare.biochem.mpg.de/public.php/dav/files/NGxD86CQSbRWmzw/report.pg_matrix.tsv"
url_metadata = "https://datashare.biochem.mpg.de/public.php/dav/files/sSYkOj22kM5AJ4O/simple_metadata.csv"

df = pd.read_csv(url_data, sep="\t")
md = pd.read_csv(url_metadata, sep=",")

# Columns of the protein-group table that describe the proteins; every other column
# is passed on as part of the data matrix.
annotation_columns = ["Protein.Group", "Protein.Names", "Genes", "First.Protein.Description"]

# Transpose so that the remaining table columns become the rows (observations) of X
# and the protein groups become its columns (variables).
intensities = df.drop(columns=annotation_columns).to_numpy().T
feature_annotations = df[annotation_columns].set_index("Protein.Group")

# NOTE(review): `md` is attached to X purely by position - this assumes the metadata rows
# are in the same order as the value columns of the protein-group table; confirm.
adata = AnnData(X=intensities, obs=md, var=feature_annotations)

# overview of the data object
print(adata)

## Basic EDA on the example dataset:

1. Inspect the loaded example data
2. Filter features by data completeness across samples
3. Visualize samples as histograms
4. Save data

### Filter by data completeness:

Remove features that have more than the allowed fraction of missing values.

In [None]:
def _show_adata_contents(adata_to_show):
    """Print a caption and display the first rows of the data matrix, ``obs`` and ``var``."""
    print("The numeric data in the anndata object:")
    display(adata_to_show.to_df().head())

    print("The sample-level metadata in the anndata object:")
    display(adata_to_show.obs.head())

    print("The feature-level metadata in the anndata object:")
    display(adata_to_show.var.head())


# contents of the object before filtering
_show_adata_contents(adata)

# filter out features with more than 25 % missing values and report the shape change
print("Before filtering, the shape of the anndata object: ", adata.shape)
adata = at.pp.filter_data_completeness(adata=adata, max_missing=0.25)
print("After filtering, the shape of the anndata object: ", adata.shape)

# contents of the object after filtering
_show_adata_contents(adata)

## Creating new layers prior to preprocessing

This way, we keep the raw data and can try different preprocessing (pp) steps on it.

In [None]:
# Save the raw data before log transformation. The guard keeps the cell safe to re-run:
# without it, a second execution would overwrite the "raw" layer with values that
# have already been log-transformed.
if "raw" not in adata.layers:
    adata.layers["raw"] = adata.X.copy()

# log2(x + 1) transform the data. It is always computed from the "raw" layer, so
# re-running the cell yields the same X instead of log-transforming it twice.
adata.X = np.log2(adata.layers["raw"] + 1)

### Visualize the distribution of values in different levels of an observational metadata variable

In this example, we check the distribution of the values of protein A1L0T0, first across all samples and then per replicate.

In [None]:
# create_figure returns the figure together with an AxisManager (axm), which makes the axes
# iterable and applies consistent AlphaTools styling. Axes can also be accessed directly
# by indexing the axm object.
fig, axm = create_figure(nrows=1, ncols=2, figsize=(5, 3))

# Settings shared by both panels. Each call gets its own copy of the style dictionary.
protein_id = "A1L0T0"
hist_style = {"alpha": 0.5, "histtype": "stepfilled", "linewidth": 0.5, "edgecolor": "black"}

# Plots.histogram handles adata natively. Columns from the data and metadata are accessible.
# Panel 1: distribution of protein A1L0T0 over all samples
ax = axm.next()
Plots.histogram(
    data=adata,
    value_column=protein_id,
    bins=20,
    legend="auto",
    ax=ax,
    hist_kwargs=dict(hist_style),
)
label_axes(ax, protein_id, "Frequency", "Distribution of A1L0T0")

# Panel 2: distribution of protein A1L0T0 in the different replicates
ax = axm.next()
Plots.histogram(
    data=adata,
    value_column=protein_id,
    color_map_column="replicate",
    bins=20,
    legend="auto",
    ax=ax,
    hist_kwargs=dict(hist_style),
)
label_axes(ax, protein_id, "Frequency", "Distribution of A1L0T0 by replicate")

# Optional: save the figure. `save_figure` is not imported and `output_directory` is not
# defined in this notebook, so both have to be provided before uncommenting.
# save_figure(
#     fig=fig,
#     filename="sample_histogram.png",
#     output_dir=output_directory,
#     dpi=300,
#     transparent=False,
# )

### Running PCA

Before running PCA, we need to deal with missing values, because PCA cannot be computed on matrices that contain NaNs. Therefore, prior to PCA, we flag the 'core proteins', i.e. the proteins detected in all observations, and save this flag in the feature metadata frame (`adata.var`).

In [None]:
# add a new column named "is_core" to adata.var that indicates whether the feature is part
# of the core proteome (evaluated on the "raw" layer)
add_core_proteome_mask(adata, layer="raw", new_column_name="is_core")

# view how many features are part of the core proteome
core_counts = adata.var["is_core"].value_counts()
print("The number of features in the core proteome:", core_counts, sep="\n")

Now we can run PCA, specifying the adata.var column that filters the proteins by 100% completeness:

In [None]:
# PCA on sample level (PCA of the observations), using the "is_core" column of adata.var as mask
pca(adata, meta_data_mask_column_name="is_core", n_comps=10)

# view the PCA results: each entry pairs a caption with the value printed below it
variance_obs = adata.uns["variance_pca_obs"]
pca_obs_report = (
    ("The dimensions of PC coordinates in the adata.obsm are (n_obs x n_comp):", adata.obsm["X_pca_obs"].shape),
    ("The PCA loadings in the adata.varm are (n_var x n_comp):", adata.varm["PCs_obs"].shape),
    ("Ratio of explained variance (n_comp):", variance_obs["variance_ratio"]),
    ("The explained variance (n_comp):", variance_obs["variance"]),
)
for caption, value in pca_obs_report:
    print(caption)
    print(value)

In addition to running PCA to get a dimensionality reduction of the observations (samples), we can also perform PCA on the features (proteins).

In [None]:
# Now run PCA on the protein space (dim_space="var") to get the projection of the
# features in the PCA space.
pca(adata, meta_data_mask_column_name="is_core", n_comps=10, dim_space="var")

# view the PCA results for features
print("----- PCA ON FEATURES -----")
# adata.varm is aligned to the features, so its first dimension is n_var
print("The dimensions of PC coordinates in the adata.varm are (n_var x n_comp):")
print(adata.varm["X_pca_var"].shape)
# adata.obsm is aligned to the observations, so its first dimension is n_obs
print("The PCA loadings of the samples in the adata.obsm are (n_obs x n_comp):")
print(adata.obsm["PCs_var"].shape)
print("Ratio of explained variance (n_comp):")
print(adata.uns["variance_pca_var"]["variance_ratio"])
print("The explained variance (n_comp):")
print(adata.uns["variance_pca_var"]["variance"])

### Plot PCA results 
We can plot the PCA results as a 2D projection, look at the explained variance of each PC using the scree plot, and plot the loadings of the PCs, either per PC or as a scatter of two PCs, to understand their 'drivers'.

In [None]:
# 2 x 2 panel overview of the PCA computed on the samples
fig, axm = create_figure(2, 2, figsize=(12, 12))

# Settings shared between panels: the pair of principal components shown in the 2D plots
# and the number of top loadings shown in the loadings plots.
pc_pair = {"pc_x": 1, "pc_y": 2}
n_top_loadings = 20

# PCA plot colored by replicate
# (the keyword `embbedings_name` is spelled exactly as in the original call)
ax = axm.next()
Plots.plot_pca(
    data=adata,
    ax=ax,
    label=False,
    label_column=None,
    embbedings_name=None,
    color_map_column="replicate",
    **pc_pair,
)

# scree plot to show the explained variance by each PC
ax = axm.next()
Plots.scree_plot(adata=adata, ax=ax, n_pcs=50)

# top loadings of the first PC
ax = axm.next()
Plots.plot_pca_loadings(data=adata, ax=ax, dim=1, nfeatures=n_top_loadings)

# 2d loading plot with highlighted top 20 loadings
ax = axm.next()
Plots.plot_pca_loadings_2d(
    data=adata,
    ax=ax,
    nfeatures=n_top_loadings,
    add_labels=True,
    add_lines=True,
    scatter_kwargs=None,
    **pc_pair,
)

### Plot PCA results for feature PCA 
Just like the PCA on the samples, we can plot the same plots for the results of PCA calculated on the features.  

In [None]:
# now produce the same 2 x 2 PCA overview for the PCA computed on the features
fig, axm = create_figure(2, 2, figsize=(12, 12))

# Keyword arguments shared by the calls below: every plot reads the feature-space
# results (dim_space="var"), and the 2D plots show the same pair of components.
feature_space = {"dim_space": "var"}
feature_pc_pair = {"pc_x": 1, "pc_y": 2}
n_top_feature_loadings = 10

# PCA plot of the features
# (the keyword `embbedings_name` is spelled exactly as in the original call)
ax = axm.next()
Plots.plot_pca(
    data=adata,
    ax=ax,
    label=False,
    label_column=None,
    embbedings_name=None,
    **feature_pc_pair,
    **feature_space,
)

# scree plot of the explained variance
ax = axm.next()
Plots.scree_plot(adata=adata, ax=ax, n_pcs=50, **feature_space)

# top loadings of the first PC
ax = axm.next()
Plots.plot_pca_loadings(data=adata, ax=ax, dim=1, nfeatures=n_top_feature_loadings, **feature_space)

# 2d loading plot with labels and lines for the top loadings
ax = axm.next()
Plots.plot_pca_loadings_2d(
    data=adata,
    ax=ax,
    nfeatures=n_top_feature_loadings,
    add_labels=True,
    add_lines=True,
    scatter_kwargs=None,
    **feature_pc_pair,
    **feature_space,
)

### UMAP Visualization with Scanpy

To explore and visualize the high-dimensional proteomics data, we use **UMAP (Uniform Manifold Approximation and Projection)** as implemented in **Scanpy**. UMAP projects complex, high-dimensional feature spaces into a lower-dimensional space (typically 2D) while preserving the local and global structure of the data. This allows us to identify clusters, relationships, and potential outliers in the proteomic profiles at a glance.

In this notebook, Scanpy's `sc.pp.neighbors()` and `sc.tl.umap()` functions are applied to the processed data matrix to compute a nearest-neighbor graph and then generate UMAP coordinates. We will use the sample PCA matrix for neighbor calculations. The resulting UMAP embedding provides an intuitive visualization of sample similarity and grouping based on proteomic features, complementing downstream analyses such as clustering or differential expression.


In [None]:
# scanpy (`sc`) is imported in the import cell at the top of the notebook.

# Compute the neighbor graph on the sample-level PCA coordinates stored in
# adata.obsm["X_pca_obs"], then compute the UMAP embedding.
sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_pca_obs")  # use the PCA results on samples
sc.tl.umap(adata)

In [None]:
# the UMAP coordinates are read from adata.obsm under the key "X_umap"
umap_coords = adata.obsm["X_umap"]
print("The UMAP coordinates in the adata.obsm are in adata.obsm['X_umap'] with shape: ", umap_coords.shape)
print(umap_coords)

### Plot UMAP
We can either plot the UMAP results using scanpy's plotting function, or use the AlphaTools plotting function after adding the UMAP coordinates directly to the `adata.obs` data frame.


In [None]:
# UMAP drawn with scanpy's own plotting function, colored by replicate
sc.pl.umap(
    adata,
    color="replicate",
    size=50,  # the size is usually much smaller
)

Another option is to copy the coordinates into the `adata.obs` data frame and plot them using the `scatter` function of the `alphatools` package.

In [None]:
# Copy the first two UMAP coordinates into adata.obs so that they can be addressed as columns.
umap_coords = adata.obsm["X_umap"]
for axis_idx, obs_column in enumerate(("UMAP1", "UMAP2")):
    adata.obs[obs_column] = umap_coords[:, axis_idx]

# single-panel figure with the UMAP scatter, colored by replicate
fig, axm = create_figure(1, 1, figsize=(5, 5))
ax = axm.next()
Plots.scatter(
    adata,
    x_column="UMAP1",
    y_column="UMAP2",
    color_map_column="replicate",
    ax=ax,
    legend="auto",
)

label_axes(ax=ax, xlabel="UMAP_1", ylabel="UMAP_2", title="UMAP colored by replicate (Alphatools implementation)")