In [None]:
!pip uninstall dlomix prospect-dataset -y

In [None]:
!pip install git+https://github.com/wilhelm-lab/dlomix.git@develop

In [None]:
!pip install git+https://github.com/wilhelm-lab/PROSPECT@develop

In [None]:
import prospectdataset as prospect 
# Notebook-wide settings used by the download and file-discovery cells below.
record_name = "prospect"      # record passed to prospect.download_dataset
pool_keyword = "third_pool"   # substring used to select the pool and match its files
data_dir = "./data"           # directory where downloaded files are stored and searched

In [None]:
# Download the configured record (task "all") into data_dir, passing the pool
# keyword as the pool selector.
prospect.download_dataset(
    record=record_name,
    task="all",
    save_directory=data_dir,
    select_pool=pool_keyword,
)

In [None]:
import glob
import os
from pathlib import Path

# Pick the path of the metadata file. It can also simply be copied and pasted
# from the output of the download cell, e.g.:
# meta_data_filepath = './data/TUM_third_pool_meta_data.parquet'

meta_data_pattern = os.path.join(data_dir, "*" + str(pool_keyword) + "*meta_data.parquet")

# glob returns matches in arbitrary order; sort so the chosen file is deterministic
# when more than one file matches the pattern.
meta_data_matches = sorted(glob.glob(meta_data_pattern))
if not meta_data_matches:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No metadata file matching {meta_data_pattern!r} was found; "
        "run the download cell first or check data_dir / pool_keyword."
    )

meta_data_filepath = meta_data_matches[0]
meta_data_filepath

In [None]:
import itertools

# Entries in data_dir whose name contains the pool keyword and that are directories
# are treated as annotation directories.
pool_pattern = os.path.join(data_dir, "*" + str(pool_keyword) + "*")
annotation_dirs = [entry for entry in glob.glob(pool_pattern) if os.path.isdir(entry)]

# Collect the parquet files of every annotation directory into one flat list.
annotations_filepaths = list(
    itertools.chain.from_iterable(
        glob.glob(os.path.join(directory, "*.parquet")) for directory in annotation_dirs
    )
)

# File names without directory and extension, in the same order as the paths.
annotations_names = [Path(filepath).stem for filepath in annotations_filepaths]

annotations_names, annotations_filepaths

In [None]:
# Annotation file name -> annotation file path for the selected pool.
pool_annotations = {
    name: filepath for name, filepath in zip(annotations_names, annotations_filepaths)
}

# Configuration describing the data source: the metadata file, the annotation
# files grouped under the pool keyword, and the target column key.
input_data_dict = {
    "metadata": meta_data_filepath,
    "annotations": {pool_keyword: pool_annotations},
    "parameters": {"target_column_key": "intensities_raw"},
}

input_data_dict

In [None]:
# The dict could later be passed directly as a data source; for now it is
# serialized to a JSON file whose path is given to the dataset class.

import json

with open("input_config.json", mode="w") as config_file:
    json.dump(input_data_dict, config_file)

In [None]:
from dlomix.data import IntensityDataset
from dlomix.data.feature_extractors import (
    ModificationGainFeature,
    ModificationLocationFeature,
    ModificationLossFeature,
)

BATCH_SIZE = 128
SEQ_LENGTH = 30

# Feature extractors handed to the dataset via `features_to_extract`.
modification_features = [
    ModificationLocationFeature(),
    ModificationLossFeature(),
    ModificationGainFeature(),
]

# Per-column comparison expressions handed to `metadata_filtering_criteria`;
# the peptide length bound reuses SEQ_LENGTH so both stay in sync.
metadata_filters = {
    "peptide_length": f"<= {SEQ_LENGTH}",
    "precursor_charge": "<= 6",
    "fragmentation": "== 'HCD'",
    "mass_analyzer": "== 'FTMS'",
}

# Build the intensity dataset from the JSON config written in the previous cell.
int_data = IntensityDataset(
    data_source="input_config.json",
    parser="proforma",
    seq_length=SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    val_ratio=0.15,
    sequence_col="modified_sequence",
    precursor_charge_col="precursor_charge_onehot",
    collision_energy_col="collision_energy_aligned_normed",
    intensities_col="intensities_raw",
    features_to_extract=modification_features,
    metadata_filtering_criteria=metadata_filters,
)

In [None]:
"Training examples", BATCH_SIZE * len(int_data.train_data)


In [None]:
"Validation examples", BATCH_SIZE * len(int_data.val_data)
