In [None]:
import os
import time
from typing import Optional, List, Dict, Any

from IPython.display import JSON, display

from md_python import MDClient, Experiment
from md_python import ExperimentDesign, SampleMetadata
from md_python import PairwiseComparisonDataset


# Build the API client and call its health check.
client = MDClient()
health = client.health.check()

# Smoke check: the health payload must be a dict before it is rendered.
assert isinstance(health, dict)
health_view = JSON(health, expanded=True)
display(health_view)

## Metadata files

Metadata files must be stored in a local directory.
For LFQ experiments, the experiment design and sample metadata can be combined into a single file, provided that the columns "filename", "sample_name", and "condition" are included.

The `files_to_upload` variable is a list of files that have already been uploaded to `s3_bucket/s3_prefix/` and are expected to be picked up by the API. These files typically include the Spectronaut output.

In [None]:
# Directory that holds the metadata CSV files. Set the MD_METADATA_PATH
# environment variable to point at your own copy; when it is unset the
# original development location is used as the fallback.
metadata_path = os.environ.get(
    "MD_METADATA_PATH",
    "/Users/giuseppeinfusini/wd/Data_for_upload_md/MD-format/Small_data",
)
# Both names point at the same combined CSV (experiment design and sample
# metadata live in a single file here).
experiment_design_filename = "experiment_design_COMBINED.csv"
sample_metadata_filename = "experiment_design_COMBINED.csv"

# S3 bucket files
files_to_upload = [
    "proteomics_proteins_COMBINED.tsv",
    "proteomics_peptides_COMBINED.tsv",
]

In [None]:
def load_experiment_design(
    dir_path: str, filename: str, delimiter: str = ","
) -> ExperimentDesign:
    """Build an ExperimentDesign from the delimited file ``filename`` in ``dir_path``.

    The directory and file name are joined with ``os.path.join`` and the
    resulting path is passed, together with ``delimiter``, to
    ``ExperimentDesign.from_csv``.
    """
    csv_path = os.path.join(dir_path, filename)
    design = ExperimentDesign.from_csv(csv_path, delimiter=delimiter)
    return design


def load_sample_metadata(
    dir_path: str, filename: str, delimiter: str = ","
) -> SampleMetadata:
    """Build a SampleMetadata from the delimited file ``filename`` in ``dir_path``.

    The directory and file name are joined with ``os.path.join`` and the
    resulting path is passed, together with ``delimiter``, to
    ``SampleMetadata.from_csv``.
    """
    csv_path = os.path.join(dir_path, filename)
    metadata = SampleMetadata.from_csv(csv_path, delimiter=delimiter)
    return metadata


# Load both metadata objects first, then type-check and preview each in turn.
exp_design = load_experiment_design(metadata_path, experiment_design_filename)
sample_metadata = load_sample_metadata(metadata_path, sample_metadata_filename)
for _loaded, _expected_type in (
    (exp_design, ExperimentDesign),
    (sample_metadata, SampleMetadata),
):
    assert isinstance(_loaded, _expected_type)
    print(_loaded)  # brief preview via __str__

## Create experiment

In [None]:
# Describe the experiment to create. The data files are taken from
# `files_to_upload` (defined in the configuration cell) instead of repeating
# the same names here, so the two lists cannot drift apart; a copy is passed
# so the Experiment does not share the config list object.
exp = Experiment(
    name="test_api_client_04",
    source="md_format",
    labelling_method="lfq",
    s3_bucket="md-development-test-data",
    s3_prefix="small_drc_api_test/",
    filenames=list(files_to_upload),
    experiment_design=exp_design,
    sample_metadata=sample_metadata,
)

# experiment_id = client.experiments.create(exp)
# print(experiment_id)
# assert isinstance(experiment_id, str) and len(experiment_id) > 0

In [None]:
# Temporary, for development: use a fixed experiment id, since the
# client.experiments.create(exp) call above is commented out.
experiment_id = "cbd62af2-19da-476d-8bb1-bda6b3823c73"

## Wait for experiment to complete

In [None]:
# Hand the experiment id to the client's wait_until_complete helper and keep
# whatever it returns (presumably the finished experiment - TODO confirm).
completed_experiment = client.experiments.wait_until_complete(experiment_id)

## Pairwise comparison


#### Find the initial intensity dataset.

In [None]:
# Look up the experiment's initial dataset via the client; its `id` is used
# as the input dataset for the pairwise comparison further down.
dataset = client.datasets.find_initial_dataset(experiment_id)
dataset

# MISSING STEP of imputation

#### Define pairwise comparisons by selecting a control.


In [None]:
# Ask the sample metadata for pairwise comparisons on the "condition" column,
# passing "md00001_a" as the control.
comparisons = sample_metadata.pairwise_vs_control(
    column="condition", control="md00001_a"
)
comparisons

In [None]:
# Configure the pairwise comparison on the initial dataset, using the
# comparisons computed above, then run it through the client.
pw = PairwiseComparisonDataset(
    input_dataset_ids=[str(dataset.id)],
    dataset_name="Pairwise test full-02",
    sample_metadata=sample_metadata,
    condition_column="condition",
    condition_comparisons=comparisons,
)
# The value returned by run() is used below as the dataset id to wait on.
dataset_id = pw.run(client)

In [None]:
# Display the current value of dataset_id.
dataset_id

In [None]:
# Hand the experiment/dataset ids to the client's wait_until_complete helper
# and display whatever it returns.
state = client.datasets.wait_until_complete(
    experiment_id=experiment_id,
    dataset_id=dataset_id,
)
state

In [None]:
from typing import List, Dict, Any
from uuid import UUID
from md_python import Dataset
from md_python.models import SampleMetadata


def create_pairwise_comparison_dataset(
    input_dataset_ids: List[str],
    dataset_name: str,
    sample_metadata: SampleMetadata,
    condition_column: str,
    condition_comparisons: List[List[str]],
    filter_valid_values_logic: str = "at least one condition",  # ["all conditions", "at least one condition", "full experiment"]
    filter_values_criteria: Optional[Dict[str, Any]] = None,
    fit_separate_models: bool = True,
    limma_trend: bool = True,
    robust_empirical_bayes: bool = True,
    control_variables: Optional[List[Dict[str, str]]] = None,
    entity_type: str = "protein",
    job_slug: str = "pairwise_comparison",
) -> Dataset:
    """Assemble the Dataset that describes a pairwise-comparison job.

    Nothing is sent to the API here; the function only builds and returns the
    ``Dataset`` object.

    Args:
        input_dataset_ids: Dataset ids as strings; each is parsed with ``UUID``.
        dataset_name: Used as the ``name`` of the returned Dataset.
        sample_metadata: Its ``to_columns()`` output is sent as
            ``experiment_design`` in the job parameters.
        condition_column: Stored under ``condition_column``.
        condition_comparisons: Comparison pairs, stored under
            ``condition_comparisons["condition_comparison_pairs"]``.
        filter_valid_values_logic: Passed through unchanged; the inline note on
            the parameter lists the values the author had in mind.
        filter_values_criteria: Passed through as a shallow copy. When omitted
            (``None``) it defaults to
            ``{"method": "percentage", "filter_threshold_percentage": 0.5}``.
            The original author's note mentions ``'count'`` /
            ``'filter_threshold_count'`` as the alternative form.
        fit_separate_models: Passed through unchanged.
        limma_trend: Passed through unchanged.
        robust_empirical_bayes: Passed through unchanged.
        control_variables: Passed through unchanged (``None`` when omitted).
        entity_type: Passed through unchanged.
        job_slug: Used as the ``job_slug`` of the returned Dataset.

    Returns:
        A ``Dataset`` carrying the parsed input ids, name, job slug and the
        assembled ``job_run_params``.

    Raises:
        ValueError: If an entry of ``input_dataset_ids`` is not a valid UUID
            string (raised by ``UUID``).
    """
    # A dict literal as a default value would be created once and shared by
    # every call; build a fresh one per call (or copy the caller's) instead.
    if filter_values_criteria is None:
        filter_values_criteria = {
            "method": "percentage",
            "filter_threshold_percentage": 0.5,
        }
    else:
        filter_values_criteria = dict(filter_values_criteria)

    return Dataset(
        input_dataset_ids=[UUID(x) for x in input_dataset_ids],
        name=dataset_name,
        job_slug=job_slug,
        job_run_params={
            "condition_column": condition_column,
            "condition_comparisons": {
                "condition_comparison_pairs": condition_comparisons
            },
            "experiment_design": sample_metadata.to_columns(),
            "filter_valid_values_logic": filter_valid_values_logic,
            "filter_values_criteria": filter_values_criteria,
            "fit_separate_models": fit_separate_models,
            "limma_trend": limma_trend,
            "robust_empirical_bayes": robust_empirical_bayes,
            "control_variables": control_variables,
            "entity_type": entity_type,
        },
    )

In [None]:
# Build the pairwise-comparison Dataset with the helper defined above, this
# time with the comparison pairs written out explicitly.
pw_dataset = create_pairwise_comparison_dataset(
    input_dataset_ids=[str(dataset.id)],
    dataset_name="Pairwise test",
    sample_metadata=sample_metadata,
    condition_column="condition",
    condition_comparisons=[["md00001_a", "md00001_b"], ["md00001_a", "md00003_a"]],
    # optional params keep defaults...
)

# Submit it through the client; note this rebinds dataset_id.
dataset_id = client.datasets.create(pw_dataset)

In [None]:
# Temporary, for development: use a fixed dataset id. This overrides the
# dataset_id assigned by the cells above.
dataset_id = "3b19ae4b-282e-4eb0-9d2b-ec1c0c7a8084"

In [None]:
# import time
# from typing import Dict, Any, Optional
# from md_python import MDClient


def wait_for_dataset(
    client: "MDClient",
    experiment_id: str,
    dataset_id: str,
    poll_s: int = 5,
    timeout_s: int = 1800,
) -> Any:
    """Poll ``list_by_experiment`` until the dataset's state is terminal.

    Args:
        client: Object exposing ``datasets.list_by_experiment(experiment_id=...)``.
        experiment_id: Experiment whose datasets are listed on every poll.
        dataset_id: Id of the dataset to wait for; compared as a string with
            ``str(d.id)`` of each listed dataset.
        poll_s: Seconds to sleep between polls.
        timeout_s: Overall time budget in seconds.

    Returns:
        The matching dataset object (as returned by ``list_by_experiment``)
        once its ``state`` is one of COMPLETED, FAILED, ERROR or CANCELLED.

    Raises:
        TimeoutError: If no terminal state is observed within ``timeout_s``.
    """
    terminal_states = {"COMPLETED", "FAILED", "ERROR", "CANCELLED"}
    # Normalise once so a UUID (or other non-str id) still matches str(d.id).
    wanted_id = str(dataset_id)
    end = time.monotonic() + timeout_s
    last: Optional[str] = None
    announced_missing = False
    while time.monotonic() < end:
        dds = client.datasets.list_by_experiment(experiment_id=experiment_id)
        ds = next((d for d in dds if str(d.id) == wanted_id), None)
        # Explicit None check: do not depend on the dataset object's truthiness.
        if ds is not None:
            state = ds.state  # use dataset.state key
            if state != last:
                # Only report transitions, not every poll.
                print(f"state={state}")
                last = state
            if state in terminal_states:
                return ds
        elif last is None and not announced_missing:
            # Say this once instead of on every poll.
            print("waiting for dataset to appear...")
            announced_missing = True
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(poll_s, end - time.monotonic())))
    raise TimeoutError(f"Dataset {dataset_id} not terminal within {timeout_s}s")


# tiny test
# result = wait_for_dataset(client, experiment_id, dataset_id)
# assert result.state in {"COMPLETED", "FAILED", "ERROR", "CANCELLED"}

In [None]:
# Wait, with the default poll interval and timeout, for the dataset to reach
# a terminal state.
result = wait_for_dataset(client, experiment_id, dataset_id)

In [None]:
# Display the dataset object returned by wait_for_dataset.
result