# Calculate AlphaKraken observables
An example notebook to calculate observables from AlphaKraken output data.

Files will be cached locally in order to reduce network traffic.



In [None]:
# environment:
# python 3.11
# jupyter==1.1.1
# tqdm==4.66.5
# -r misc/requirements_development.txt

In [None]:
import shutil
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

## User input
Set these according to your system.

In [None]:
# base directory under which the input file and the local cache are located
HOME_DIR = Path("/Users/mschwoerer")

# where to find csv export of overview table
input_path = HOME_DIR / "Downloads/2024-09-10T08-46_export.csv"

# where to find alphakraken network mount
network_path = Path("/Volumes/pool-alphakraken")

# where to store temporary files (the local cache of the copied output files)
local_temp_path = HOME_DIR / "alphakraken_temp"

# output files that are copied and read for each raw file; add more if you need more files
# NOTE: extract_observables() accesses the loaded tables by position (index 0 is
# treated as the precursors table), so keep "precursors.tsv" first.
FILE_NAMES = ["precursors.tsv", "stat.tsv"]

In [None]:
import warnings

# The network mount is only needed for files that are not cached locally yet,
# so emit a warning instead of raising: raising would abort this cell (and a
# "Run All") even though everything may be served from the cache.
if not network_path.exists():
    warnings.warn(
        "Network path not found. Not a problem if the files are already cached."
    )

## Helper code

In [None]:
# folder on the network mount that holds the output files
output_files_path = network_path.joinpath("output")

# make sure the local cache folder exists (parents included); no error if it is already there
local_temp_path.mkdir(exist_ok=True, parents=True)

In [None]:
def _get_project_for_instrument(row: pd.Series) -> str:
    """Use heuristics to find out project name."""
    # if row["project"]: # TODO: add to kraken table
    #     return row["project"]
    if row["instrument_id"].startswith("tims"):
        return "_FALLBACK_BRUKER"
    return "_FALLBACK"

## Read data from AlphaKraken table

In [None]:
def read_overview_data(input_path: Path) -> pd.DataFrame:
    """Load the AlphaKraken overview export from a csv file.

    The column holding the file names ("_id" or "Unnamed: 0") is renamed to
    "file_name", and a "year_month" column (format "%Y_%m") is derived from
    the "file_created" column.
    """
    exported_df = pd.read_csv(input_path)
    # the file name column is called differently depending on the export
    result_df = exported_df.rename(
        columns={"_id": "file_name", "Unnamed: 0": "file_name"}
    )
    created_at = pd.to_datetime(result_df["file_created"])
    result_df["year_month"] = created_at.dt.strftime("%Y_%m")
    return result_df

## Copy data

In [None]:
def _is_file_ready(
    file_name: str, local_path: Path, source_path: Path, single_file_paths: dict
) -> bool:
    """Check if file is already cached, if not copy it. Returns False if file is not available for further processing."""
    all_exist_src = True
    all_exist_dst = True
    if not local_path.exists():
        if not source_path.exists():
            print("not found:", file_name)
            return False

        local_path.mkdir(parents=True, exist_ok=True)

        for src_path, dst_path in single_file_paths.values():
            all_exist_src &= src_path.exists()
            if src_path.exists():
                shutil.copy2(src_path, dst_path)
    else:
        for _, dst_path in single_file_paths.values():
            all_exist_dst &= dst_path.exists()

    if not all_exist_src:
        print("results missing in source:", file_name)
        return False
    if not all_exist_dst:
        print("results missing in dest, removing. Re-run to fix.", file_name)
        shutil.rmtree(local_path)
        return False
    return True


def read_and_cache_output(overview_df: pd.DataFrame) -> dict:
    """Load the output files for every row of the overview table, caching them locally.

    Returns a dict mapping each file name to a list of data frames, one per
    entry of FILE_NAMES and in the same order. Rows whose output files are
    not available are left out.
    """
    tables_by_file = {}

    rows = tqdm(overview_df.iterrows(), total=len(overview_df))
    for _, entry in rows:
        raw_file_name = entry["file_name"]
        project_name = _get_project_for_instrument(entry)

        # same relative layout at the source and in the local cache
        sub_dir = f"{project_name}/{entry['year_month']}/out_{raw_file_name}"
        remote_dir = output_files_path / sub_dir
        cache_dir = local_temp_path / sub_dir

        # output file name -> (path at source, path in cache)
        path_pairs = {
            name: (remote_dir / name, cache_dir / name) for name in FILE_NAMES
        }

        if _is_file_ready(raw_file_name, cache_dir, remote_dir, path_pairs):
            # always read from the cached copy
            tables_by_file[raw_file_name] = [
                pd.read_csv(path_pairs[name][1], sep="\t") for name in FILE_NAMES
            ]

    return tables_by_file

## Extract data

In [None]:
def extract_observables(all_data: dict) -> pd.DataFrame:
    """Extract observables from AlphaKraken output.

    Args:
        all_data: mapping of file name to the sequence of data frames loaded
            for that file; the first element is used as the precursors table.

    Returns:
        Data frame with one row per file and the columns "file_name" and
        "intensity_sum". The columns are present even if `all_data` is empty,
        so the result can always be merged on "file_name".
    """
    # Pre-seed the known columns: without them an empty `all_data` would give
    # a frame without a "file_name" column, and merging on it would fail.
    tmp_dict = defaultdict(list, {"file_name": [], "intensity_sum": []})
    for file_name, file_tables in all_data.items():
        tmp_dict["file_name"].append(file_name)
        # add your observables here:
        precursors_data = file_tables[0]
        tmp_dict["intensity_sum"].append(precursors_data["intensity"].sum())

    return pd.DataFrame(tmp_dict)

## Plot data

In [None]:
# only the first 10 rows of the overview table are processed here
overview_df = read_overview_data(input_path).head(10)
all_data = read_and_cache_output(overview_df)
observables_df = extract_observables(all_data)
# join the observables with the overview columns via the shared "file_name" column
data_df = pd.merge(observables_df, overview_df, on="file_name")

data_df.plot.scatter(x="file_created", y="intensity_sum")
plt.xticks(rotation=45)