# Exploratory Analysis

In [None]:
%pip install acore

In [None]:
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import acore.correlation_analysis as ca
import acore.exploratory_analysis as ea
from acore.types.exploratory_analysis import (
    AnnotationResult,
    TwoComponentSchema,
    TwoLoadingsSchema,
    TwoVariance,
)

Utility function for plotting

In [None]:
def make_plot(
    embeddings,
    x: str,
    y: str,
    annotation: Optional[object] = None,
    group: str = "group",
    **kwargs,
):
    """Draw a static scatter plot of a two-dimensional result, one color per group.

    Parameters
    ----------
    embeddings : pandas.DataFrame
        Table with one row per point. Columns literally named ``"x"`` and ``"y"``
        are renamed to the requested ``x`` and ``y`` names before plotting.
    x, y : str
        Column names used for the horizontal and vertical axis.
    annotation : optional
        Either an object with ``x_title`` and ``y_title`` attributes or a dict
        with these keys. If given, the titles replace the axis labels.
    group : str
        Column of ``embeddings`` used to split the points into colored groups.
    **kwargs
        Ignored. Accepted so that a name mapping with additional keys can be
        unpacked into the call.

    Returns
    -------
    tuple
        The matplotlib figure and the axes holding all groups.
    """
    fig, ax = plt.subplots()
    # Built from the arguments so the function does not depend on a global mapping.
    axis_names = {"x": x, "y": y}
    # `label` is kept distinct from the `group` parameter, which names the column.
    for i, (label, group_df) in enumerate(embeddings.groupby(group)):
        ax = group_df.rename(columns=axis_names).plot.scatter(
            x=x,
            y=y,
            label=label,
            c=f"C{i}",  # i-th color of the active matplotlib color cycle
            ax=ax,
        )
    if annotation is not None:
        if isinstance(annotation, dict):
            x_title, y_title = annotation["x_title"], annotation["y_title"]
        else:
            x_title, y_title = annotation.x_title, annotation.y_title
        _ = ax.set(ylabel=y_title, xlabel=x_title)
    return fig, ax

## Load metabolomics example data

In [None]:
# Location of the processed example table in the acore repository.
data_url = (
    "https://raw.githubusercontent.com/Multiomics-Analytics-Group/acore/"
    "refs/heads/main/"
    "example_data/MTBLS13311/MTBLS13411_processed_data.csv"
)
# The first column of the file is used as the index.
data = pd.read_csv(data_url, index_col=0)
# specific to this data, we shorten some column names for better readability:
# keep the text after the last "(" and remove the closing ")".
# regex=False makes sure ")" is treated as a literal character on every pandas version.
data.columns = data.columns.str.split("(").str[-1].str.replace(")", "", regex=False)
data

We add the group here based on the sample names. Alternatively, you could merge it from
the available metadata.

In [None]:
# The group label is the part of the sample name (index) before the first "-".
group_labels = data.index.str.split("-").str[0]
data["group"] = group_labels
# Number of samples per group.
data["group"].value_counts()

## Principal Component Analysis (PCA)
Show first two principal components of the data.

In [None]:
# map_names gives the column names for the plot axes (which default to "x" and "y")
map_names = {
    "value": "feature_communiality",
    "x": "PC1",
    "y": "PC2",
}
# Run the PCA for two components with "group" as the grouping column. The call
# returns the result tables together with the annotation information for plotting.
results_dfs, annotation = ea.run_pca(
    data, drop_cols=[], annotation_cols=[], group="group", components=2, dropna=True
)
# The results unpack into the sample components, the feature loadings and the
# explained variance.
pcs, loadings, var_explained = results_dfs

See how much variance is explained by the first two components and validate that
they adhere to the expected format:

In [None]:
# Label the explained-variance values as PC1/PC2 and pass them to the TwoVariance schema.
TwoVariance(pd.Series(var_explained, index=["PC1", "PC2"]))

Show the annotation information for plotting and validate that
they adhere to the expected format:

In [None]:
# The annotation returned by run_pca is unpacked as keyword arguments of
# AnnotationResult; from here on `annotation` refers to that object.
annotation = AnnotationResult(**annotation)
annotation

Make the PCA plot:

In [None]:
# map_names supplies x and y ("PC1", "PC2"); its additional "value" entry is
# collected by the **kwargs of make_plot.
fig, ax = make_plot(pcs, annotation=annotation, **map_names)

Show what was computed and validate that
they adhere to the expected format:
- first two principal components of the samples
- loadings for the features on the first two components

We rename the columns for better readability.

In [None]:
# Pass the sample components to TwoComponentSchema, then show them with the
# readable column names from map_names.
TwoComponentSchema(pcs).rename(columns=map_names)

The feature communality of the loading is the absolute length of the projection.
So the features listed first here contribute the most to the two first components,
therefore driving the PCA separation.

In [None]:
# Pass the feature loadings to TwoLoadingsSchema, then show them with the
# readable column names from map_names.
TwoLoadingsSchema(loadings).rename(columns=map_names)

## Uniform Manifold Approximation and Projection (UMAP)
Visualize UMAP low-dimensional embedding of the data.
This uses the `umap-learn` package, which is documented with examples at
[umap-learn.readthedocs.io](https://umap-learn.readthedocs.io).

In [None]:
# map_names gives the column names for the plot axes (which default to "x" and "y")
map_names = {
    "x": "UMAP1",
    "y": "UMAP2",
}
# Compute the UMAP embedding. The call returns the results (the embedding is read
# from result["umap"] when plotting) and the annotation information for plotting.
# n_neighbors, min_dist and metric are the UMAP settings used for this example.
result, annotation = ea.run_umap(
    data,
    drop_cols=["sample", "subject"],
    group="group",
    n_neighbors=10,
    min_dist=0.3,
    metric="cosine",
    dropna=True,
)

In [None]:
# The annotation returned by run_umap is unpacked as keyword arguments of
# AnnotationResult; from here on `annotation` refers to that object.
annotation = AnnotationResult(**annotation)
annotation

In [None]:
# Scatter plot of the embedding stored under the "umap" key of the result.
fig, ax = make_plot(result["umap"], annotation=annotation, **map_names)
# Pass the same table to TwoComponentSchema and show it with the readable column names.
TwoComponentSchema(result["umap"]).rename(columns=map_names)

Make sure to check the parameter descriptions and tutorials in the API docs at
[umap-learn.readthedocs.io](https://umap-learn.readthedocs.io).


## Coefficient of variation
Using mass spectrometry data, we can compute the coefficient of variation on the
non-log transformed intensities. We do this for each group separately.
First we undo the log transformation, which is something specific to this dataset.

In [None]:
# Back-transform every measurement with 2**x (all columns except "group"),
# then attach the group labels again.
intensities = np.exp2(data.drop(columns=["group"]))
data_exp = intensities.join(data["group"])
data_exp

In [None]:
# Coefficient of variation on the back-transformed values, using "group" as the
# grouping column.
res = ea.get_coefficient_variation(data=data_exp, group="group")
res

In [None]:
# Summary statistics of the result table.
res.describe()

In [None]:
# Columns to plot; the "group" entry is passed on as the `group` argument of make_plot.
map_names = {"x": "mean_log2", "y": "coef_of_var", "group": "group"}
fig, ax = make_plot(res, **map_names)

## Correlation analysis
See [`acore.correlation_analysis`](acore.correlation_analysis) for more functions
and details.

The basic functionality is built into pandas, but you need to filter out columns
which are not numeric for pearson correlation.

Generally: Ordered categorical values can be used, assuming equal spacing between
the categories. Otherwise, continuous numeric values are required.

In [None]:
# Pairwise Pearson correlation of all columns except the group label.
features = data.drop(columns=["group"])
corr = features.corr(method="pearson")
corr

Plot the correlation heatmap using seaborn

In [None]:
# Use font size 5 for the tick labels on both axes.
plt.rcParams.update({"xtick.labelsize": 5, "ytick.labelsize": 5})
fig, ax = plt.subplots(figsize=(7.1, 6))
# Diverging color map centered at zero, square cells separated by thin lines.
heatmap_options = dict(
    cmap="vlag",
    center=0,
    square=True,
    linewidths=0.1,
    cbar_kws={"label": "Pearson r"},
)
heatmap = sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title("Correlation Heatmap")
fig.tight_layout()

In [None]:
# If you only want to keep the lower triangle of the correlation matrix to have
# unique values of interest, you can use the utility function:

In [None]:
# Pearson correlation of all columns except "group", reduced to the lower
# triangle by the acore utility function.
lower_corr = ca.corr_lower_triangle(data.drop(columns=["group"]), method="pearson")
lower_corr

Plot the lower triangle correlations as a histogram to see the distribution of
correlation values

In [None]:
# Flatten the lower triangle into one long series, then bin the values.
pairwise_r = lower_corr.stack()
ax = pairwise_r.plot.hist(
    title="Distribution of Pearson correlation values",
    xlabel="Pearson r",
    ylabel="Frequency",
    xlim=(-1.02, 1.02),
    bins=50,
    figsize=(6, 4),
    grid=False,
)

Or find the strongest correlations, which you might want to filter further to remove
uninteresting correlations between redundant features.

In [None]:
lower_corr_stack = lower_corr.stack()
# Order the entries by absolute correlation and keep the labels of the top 20 ...
by_strength = lower_corr_stack.abs().sort_values(ascending=False)
idx_largerst_corr = by_strength.head(20).index
# ... then look up the signed correlation values for these entries.
lower_corr_stack.loc[idx_largerst_corr]

This function can be used to compute multiple correlation methods at once
and compare them, here for the first four features.

It only works on numeric values.

In [None]:
methods = ["pearson", "spearman", "kendall"]
# One column per method: each is the stacked lower triangle for the first four columns.
corr = pd.concat(
    [
        ca.corr_lower_triangle(data.iloc[:, :4], method=method, numeric_only=True)
        .stack()
        .rename(method)
        for method in methods
    ],
    axis=1,
)
# Sort by the Pearson values so the methods can be compared along one axis.
corr = corr.sort_values(by="pearson", ascending=True)
corr.plot(style=".", ylim=(-1.05, 1.05), alpha=0.5, rot=45)

Filtering correlations based on p-values with multiple testing correction
- the p-value depends on the number of samples
- and the strength of the correlation

In [None]:
# Pearson correlation between the first and the second column of data;
# the result is printed as returned.
res = ca.calculate_correlations(data.iloc[:, 0], data.iloc[:, 1], method="pearson")
print(res)

For the first four features, we would only keep one significant correlation
after multiple testing correction with the Benjamini-Hochberg method.

In [None]:
# Pearson correlations for the first four columns with alpha=0.05 and the
# "fdr_bh" multiple testing correction.
correlation = ca.run_correlation(
    data.iloc[:, :4], alpha=0.05, group="group", method="pearson", correction="fdr_bh"
)
correlation

The efficient correlation calculation can be used to compute the correlation
matrix and p-value matrix for larger datasets.

In [None]:
# Returns two objects, unpacked as correlation values and p-values (this rebinds
# `corr`); only the p-values are displayed.
corr, p = ca.run_efficient_correlation(data.iloc[:, :4], method="spearman")
pd.DataFrame(p)

you can verify the results against [`scipy.stats.spearmanr`](scipy.stats.spearmanr)

In [None]:
# Reference computation with scipy on the same four columns (this rebinds `p`);
# the p-values are displayed for comparison.
r, p = scipy.stats.spearmanr(data.iloc[:, :4])
pd.DataFrame(p)

same for pearson correlation

In [None]:
# Pearson variant for the first three columns (this rebinds `r` and `p`);
# the p-values are displayed.
r, p = ca.run_efficient_correlation(data.iloc[:, :3], method="pearson")
pd.DataFrame(p)

In [None]:
# Reference values from scipy for the columns at positions 0 and 2.
r_20, p_20 = scipy.stats.pearsonr(data.iloc[:, 0], data.iloc[:, 2])
# Compare absolute differences: a signed difference would also pass when the
# matrix entry lies far below the reference value.
assert abs(r[2, 0] - r_20) < 1e-8
assert abs(p[2, 0] - p_20) < 1e-8
# Last expression of the cell, so the reference values are displayed.
r_20, p_20

To calculate p-values for the correlation matrix, you can use

In [None]:
# Pass the Pearson correlation matrix of the first three columns (as a numpy
# array) together with the number of rows of data as n_obs.
res = ca.calculate_pvalue_correlation(
    data.iloc[:, :3].corr(method="pearson").values, n_obs=data.shape[0]
)
pd.DataFrame(res)

## Histogram of the data
We often want to plot the distribution of the data values.
Sometimes you want to use custom bins, e.g. to align multiple histograms.
Here we plot and compute histogram frequencies with custom bins.

In [None]:
# Derive the bin edges from the numeric columns only: the "group" labels are
# strings and cannot be compared with the measurements.
numeric_data = data.select_dtypes(include="number")
# Round outwards (floor/ceil) so the integer-spaced edges enclose the smallest
# and the largest value; truncating with int() would cut off the upper tail.
lower_edge = int(np.floor(numeric_data.min().min()))
upper_edge = int(np.ceil(numeric_data.max().max()))
bins = np.arange(lower_edge, upper_edge + 1, step=1)

# Overlay the histograms of the first four columns on the same axes.
for col in data.columns[:4]:
    ax = data[col].plot.hist(
        bins=bins,
        alpha=0.5,
        xlabel="Value",
        ylabel="Frequency",
    )
ax.title.set_text("Histogram with custom bins")
ax.legend()

In [None]:
# Histogram series for each of the first four columns, all computed with the
# same bin edges and named after their column.
hist_series = [
    ea.get_histogram_series(data[col], bins).rename(col) for col in data.columns[:4]
]

# Combine the per-column series side by side.
hist_df = pd.concat(hist_series, axis=1)
hist_df

Done.