# Compare AlphaKraken observables between different environments
An example notebook to access and compare observables from AlphaKraken output data.




In [None]:
# ruff: noqa: PLR2004, PD002 # magic numbers, inplace

In [None]:
# environment:
# python 3.11
# jupyter==1.1.1
# -r misc/requirements_development.txt

In [None]:
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

## User input
Set these according to your system.

In [None]:
# Defaults to the home directory of the user running the notebook, so the
# paths below resolve on any machine; point it elsewhere if your exports live
# in a different location.
HOME_DIR = Path.home()

# where to find csv export of overview table
input_path_sandbox = HOME_DIR / "Downloads/AlphaKraken_20250717-084639_filtered.csv"
input_path_prod = HOME_DIR / "Downloads/AlphaKraken_20250717-084612_filtered.csv"

## Analysis code

### Read data from AlphaKraken table

In [None]:
def read_overview_data(input_path: Path) -> pd.DataFrame:
    """Load a CSV export of the AlphaKraken overview table.

    A column named "_id" or "Unnamed: 0" is renamed to "file_name", and a
    "year_month" column (formatted as "%Y_%m") is derived from "file_created".
    """
    id_column_aliases = ("_id", "Unnamed: 0")
    table = pd.read_csv(input_path).rename(
        columns=dict.fromkeys(id_column_aliases, "file_name")
    )

    created = pd.to_datetime(table["file_created"])
    table["year_month"] = created.dt.strftime("%Y_%m")
    return table

### Extract data

In [None]:
def extract_observables(all_data: dict) -> pd.DataFrame:
    """Collect per-file observables into one table.

    Iterates over ``all_data`` (file name -> indexable of outputs) and builds
    one row per file. The first element of each value must support
    ``["intensity"].sum()``; that sum is stored as "intensity_sum".
    """
    columns = defaultdict(list)

    for name, outputs in all_data.items():
        precursors = outputs[0]

        columns["file_name"].append(name)
        # add your observables here:
        columns["intensity_sum"].append(precursors["intensity"].sum())

    return pd.DataFrame(columns)

### Plotting functions

In [None]:
def plot_obs(obs: str, *, symmetric: bool = True) -> None:
    """Scatter-plot one observable, prod (x axis) against sandbox (y axis).

    Reads the module-level merged DataFrame ``df`` and uses its columns
    ``{obs}_prod``, ``{obs}_sandbox`` and ``settings_version_sandbox``.
    Points are colored by ``settings_version_sandbox`` and a dashed grey
    identity line (y = x) is drawn for reference.

    Args:
        obs: Observable name without the ``_prod`` / ``_sandbox`` suffix.
        symmetric: If True, both axes get the same limits, spanning the
            combined min/max of the two columns.
    """
    colors_dict = {
        15: "red",
        16: "green",
    }
    default_color = "blue"
    x_col = f"{obs}_prod"
    y_col = f"{obs}_sandbox"

    colors = [
        colors_dict.get(x, default_color) for x in df["settings_version_sandbox"]
    ]

    df.plot(
        kind="scatter",
        x=x_col,
        y=y_col,
        color=colors,
        alpha=0.2,
    )

    # One empty proxy scatter per color gives a readable legend entry for each
    # settings version. Passing the whole dict as the scatter's label would
    # only print the dict's repr next to a single handle.
    for version, color in colors_dict.items():
        plt.scatter(
            [], [], color=color, label=f"settings_version_sandbox = {version}"
        )
    plt.scatter([], [], color=default_color, label="other settings_version_sandbox")

    # The identity line covers the prod range by default ...
    line_min = df[x_col].min()
    line_max = df[x_col].max()

    if symmetric:
        # ... and the full shared axis range when both axes use common limits,
        # so it always runs from corner to corner.
        line_min = min(line_min, df[y_col].min())
        line_max = max(line_max, df[y_col].max())
        plt.xlim([line_min, line_max])
        plt.ylim([line_min, line_max])

    plt.plot(
        [line_min, line_max],
        [line_min, line_max],
        color="grey",
        linestyle="--",
    )
    plt.legend()

    plt.show()

## Analysis: single environment

In [None]:
# Load the overview export referenced by input_path_prod into a DataFrame.
prod_df = read_overview_data(input_path_prod)

In [None]:
def get_status_details(
    df: pd.DataFrame,
    status: str,
    mask: pd.Series = None,
    blacklist: list[str] = ["NOT_DIA_DATA"],  # noqa: B006
) -> pd.Series:
    """Get status details for a given status."""
    mask1 = df["status"] == status
    mask2 = df["status_details"].apply(
        lambda x: not any(b in str(x) for b in blacklist)
    )
    mask3 = len(df) * [True] if mask is None else mask

    return df[mask1 & mask2 & mask3]["status_details"].value_counts()

In [None]:
# histogram of status_details

_, axs = plt.subplots(1, 2, figsize=(10, 5))

size_mask = prod_df["size_gb"] > 0

# one bar chart per settings version: version 4 on the left axis, 5 on the right
for axis, version in zip(axs, (4, 5)):
    version_mask = (prod_df["settings_version"] == version) & size_mask
    get_status_details(prod_df, "quanting_failed", version_mask).plot(
        kind="bar", ax=axis
    )

In [None]:
# boxplot of sizes vs status
# every status other than "quanting_failed" is collapsed into "other"
is_failed = prod_df["status"] == "quanting_failed"
prod_df["status_group"] = is_failed.map({True: "quanting_failed", False: "other"})

prod_df.boxplot(column="size_gb", by="status_group")

## Analysis: compare two environments

In [None]:
sandbox_df = read_overview_data(input_path_sandbox)
prod_df = read_overview_data(input_path_prod)

# Rows without a settings version get version 1. Only the "settings_version"
# column is filled: assigning 1 to the whole row would also overwrite
# "file_name" and every observable, corrupting the merge on "file_name".
sandbox_df["settings_version"] = sandbox_df["settings_version"].fillna(1)
prod_df["settings_version"] = prod_df["settings_version"].fillna(1)

# keep only sandbox rows with a settings version above 2
sandbox_df = sandbox_df[sandbox_df["settings_version"] > 2]

In [None]:
# Join both environments on the file name (default inner join); columns present
# in both tables get a "_prod" / "_sandbox" suffix.
df = pd.merge(
    left=prod_df,
    right=sandbox_df,
    on="file_name",
    suffixes=("_prod", "_sandbox"),
)
df.head()

In [None]:
# observables to plot first, in this order
columns_on_top = [
    "proteins",
    "precursors",
    "ms1_accuracy",
    "fwhm_rt",
    "quanting_time_minutes",
]
# columns that are never plotted
excluded_columns = [
    "file_name",
    "year_month",
    "settings_version",
    "created_at_",
    "updated_at_",
    "file_created",
    "collision_flag",
    "file_info",
    "created_at",
    "gradient_length",
    "original_name",
    "raw_file",
    "_id.1",
]
# all remaining prod columns follow, in table order
rest_of_columns = [col for col in prod_df.columns if col not in columns_on_top]

observables = [
    col for col in columns_on_top + rest_of_columns if col not in excluded_columns
]
for obs in observables:
    # best effort: a column that cannot be plotted is reported and skipped
    try:
        plot_obs(obs)
    except Exception as err:  # noqa: BLE001, PERF203
        print("could not plot", obs, " : ", err)