# Exploratory Analysis

In [None]:
%pip install acore

In [None]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import acore.exploratory_analysis as ea
from acore.types.exploratory_analysis import (
    AnnotationResult,
    TwoComponentSchema,
    TwoLoadingsSchema,
    TwoVariance,
)

Utility function for plotting

In [None]:
def make_plot(
    embeddings,
    x: str,
    y: str,
    annotation: "Optional[AnnotationResult | dict[str, str]]" = None,
    group: str = "group",
    **kwargs,
):
    """Draw a static scatter plot of a two-dimensional embedding, one colour per group.

    Parameters
    ----------
    embeddings : pandas.DataFrame
        Table holding the coordinates in columns named ``"x"`` and ``"y"`` and
        a column with the group label of each row.
    x, y : str
        Display names the ``"x"`` and ``"y"`` columns are renamed to before
        plotting.
    annotation : AnnotationResult or dict, optional
        Source of the axis titles, read from ``x_title`` and ``y_title``
        (as attributes, or as keys when a plain dict is passed). When omitted,
        the axis labels are left as set by the scatter call.
    group : str
        Name of the column in ``embeddings`` to group by. Each group is drawn
        in its own colour and gets its own legend entry.
    **kwargs
        Accepted and ignored, so that a label mapping containing additional
        entries can be unpacked into the call.

    Returns
    -------
    tuple
        The matplotlib figure and axes the points were drawn on.
    """
    fig, ax = plt.subplots()
    # Build the rename mapping from the arguments instead of reading a
    # notebook-level global, so the function works on its own.
    axis_names = {"x": x, "y": y}
    # `label` (not `group`) as the loop variable: the parameter must stay intact
    # because it selects the grouping column.
    for i, (label, group_df) in enumerate(embeddings.groupby(group)):
        ax = group_df.rename(columns=axis_names).plot.scatter(
            x=x,
            y=y,
            label=label,
            c=f"C{i}",  # i-th colour of the active matplotlib colour cycle
            ax=ax,
        )
    if annotation is not None:
        if isinstance(annotation, dict):
            x_title, y_title = annotation["x_title"], annotation["y_title"]
        else:
            x_title, y_title = annotation.x_title, annotation.y_title
        _ = ax.set(ylabel=y_title, xlabel=x_title)
    return fig, ax

## Load metabolomics example data

In [None]:
# Processed example data, read directly from the acore GitHub repository.
# The first CSV column is used as the row index.
data = pd.read_csv(
    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/"
    "refs/heads/main/"
    "example_data/MTBLS13311/MTBLS13411_processed_data.csv",
    index_col=0,
)
data

We add the group here based on the sample names. Alternatively, you could merge it from
the available metadata.

In [None]:
# The group label is the part of each sample name before the first "-".
sample_prefix = data.index.str.split("-").str[0]
data["group"] = sample_prefix
# Number of samples per group.
data["group"].value_counts()

## Principal Component Analysis (PCA)
Show first two principal components of the data.

In [None]:
# map_names gives the display names for the result columns: the plot axes
# (which default to "x" and "y") and the "value" column, which is labelled as
# the feature communality.
map_names = {
    "value": "feature_communality",
    "x": "PC1",
    "y": "PC2",
}
# run_pca returns the result tables together with the annotation information
# used for labelling the plot.
results_dfs, annotation = ea.run_pca(
    data, drop_cols=[], annotation_cols=[], group="group", components=2, dropna=True
)
# The result tables are: sample scores, feature loadings, explained variance.
pcs, loadings, var_explained = results_dfs

See how much variance is explained by the first two components and validate that
they adhere to the expected format:

In [None]:
# Label the two explained-variance values by component and validate them with TwoVariance.
TwoVariance(pd.Series(var_explained, index=["PC1", "PC2"]))

Show the annotation information for plotting and validate that
it adheres to the expected format:

In [None]:
# Unpack the annotation returned by run_pca into an AnnotationResult;
# from here on `annotation` refers to that object.
annotation = AnnotationResult(**annotation)
annotation

Make the PCA plot:

In [None]:
# Scatter plot of the sample scores; map_names is unpacked into keyword arguments.
fig, ax = make_plot(pcs, annotation=annotation, **map_names)

Show what was computed and validate that
they adhere to the expected format:
- first two principal components of the samples
- loadings for the features on the first two components

We rename the columns for better readability.

In [None]:
# Validate the sample scores with TwoComponentSchema, then relabel the columns for display.
TwoComponentSchema(pcs).rename(columns=map_names)

The feature communality of the loading is the absolute length of the projection.
So the features listed first here contribute the most to the first two components,
therefore driving the PCA separation.

In [None]:
# Validate the feature loadings with TwoLoadingsSchema, then relabel the columns for display.
TwoLoadingsSchema(loadings).rename(columns=map_names)

## Uniform Manifold Approximation and Projection (UMAP)
Visualize UMAP low-dimensional embedding of the data.
This uses the `umap-learn` package, which is documented with examples at
[umap-learn.readthedocs.io](https://umap-learn.readthedocs.io).

In [None]:
# Display names for the two embedding axes (the result columns default to "x" and "y").
map_names = dict(x="UMAP1", y="UMAP2")
# UMAP hyperparameters, kept apart from the data-handling arguments.
umap_settings = {"n_neighbors": 10, "min_dist": 0.3, "metric": "cosine"}
result, annotation = ea.run_umap(
    data,
    drop_cols=["sample", "subject"],
    group="group",
    dropna=True,
    **umap_settings,
)

In [None]:
# Unpack the annotation returned by run_umap into an AnnotationResult;
# from here on `annotation` refers to that object.
annotation = AnnotationResult(**annotation)
annotation

In [None]:
# The embedding table is stored under the "umap" key of the result.
fig, ax = make_plot(result["umap"], annotation=annotation, **map_names)
# Validate the embedding with TwoComponentSchema, then relabel the columns for display.
TwoComponentSchema(result["umap"]).rename(columns=map_names)

Make sure to check the parameter descriptions and tutorials in the documentation at
[umap-learn.readthedocs.io](https://umap-learn.readthedocs.io).

## Correlation analysis

### Coefficient of variation
Using mass spectrometry data, we can compute the coefficient of variation on the non-log-transformed
intensities. We do this for each group separately.
First we undo the log transformation, which is something specific to this dataset.

In [None]:
# Undo the log2 transformation on every feature column (2 ** value); the
# non-numeric "group" column is set aside first and re-attached afterwards.
data_exp = data.drop(columns=["group"]).apply(np.exp2).join(data["group"])
data_exp

In [None]:
# Coefficient of variation on the back-transformed intensities, using the "group" column.
res = ea.get_coefficient_variation(data=data_exp, group="group")
res

In [None]:
# Summary statistics of the coefficient-of-variation table.
res.describe()

In [None]:
# Display names for the "x" and "y" columns; the "group" entry is passed on to
# make_plot's `group` parameter when the mapping is unpacked.
map_names = {"x": "mean_log2", "y": "coef_of_var", "group": "group"}
fig, ax = make_plot(res, **map_names)