In [None]:
!pip uninstall dlomix prospect-dataset -y

In [None]:
!pip install git+https://github.com/wilhelm-lab/dlomix.git@develop

In [None]:
!pip install git+https://github.com/wilhelm-lab/PROSPECT@develop

In [4]:
import prospectdataset as prospect 
# Shared settings used by the download and file-discovery cells below.
record_name = "prospect"     # passed as `record` to prospect.download_dataset
pool_keyword = "third_pool"  # passed as `select_pool`; also used in the glob patterns
data_dir = "./data"          # download target; also searched for the parquet files

In [None]:
# Download the record into data_dir, restricted to the selected pool (task="all").
prospect.download_dataset(
    record=record_name,
    task="all",
    save_directory=data_dir,
    select_pool=pool_keyword,
)

In [5]:
import glob
import os
from pathlib import Path

# Pick the path of the metadata file for the selected pool. It can also simply be
# copied and pasted from the output of the download cell, e.g.:
# meta_data_filepath = './data/TUM_third_pool_meta_data.parquet'

meta_data_pattern = os.path.join(data_dir, f"*{pool_keyword}*meta_data.parquet")
# glob returns matches in arbitrary order; sort so the chosen file is reproducible.
meta_data_matches = sorted(glob.glob(meta_data_pattern))
if not meta_data_matches:
    # Fail with a clear message instead of a bare IndexError from `[0]`.
    raise FileNotFoundError(
        f"No metadata file matching {meta_data_pattern!r}; run the download cell first."
    )
meta_data_filepath = meta_data_matches[0]
meta_data_filepath

'./data/TUM_third_pool_meta_data.parquet'

In [6]:
import itertools

# Collect the annotation parquet files: every directory directly under data_dir
# whose name contains the pool keyword is searched for *.parquet files.
# glob returns matches in arbitrary order, so both levels are sorted to keep the
# resulting name -> path mapping reproducible across runs and machines.
annotation_dirs = sorted(
    path
    for path in glob.glob(os.path.join(data_dir, f"*{pool_keyword}*"))
    if os.path.isdir(path)
)
annotations_filepaths = list(
    itertools.chain.from_iterable(
        sorted(glob.glob(os.path.join(d, "*.parquet"))) for d in annotation_dirs
    )
)
if not annotations_filepaths:
    # An empty list would silently produce an empty config and fail much later.
    raise FileNotFoundError(
        f"No annotation parquet files found for pool {pool_keyword!r} under {data_dir!r}; "
        "run the download cell first."
    )
# File stems (file name without extension) are used as keys in the config dict below.
annotations_names = [Path(f).stem for f in annotations_filepaths]

annotations_names, annotations_filepaths

(['TUM_third_pool_1_01_01_annotation',
  'TUM_third_pool_2_01_01_annotation',
  'TUM_third_pool_3_01_01_annotation',
  'TUM_third_pool_4_01_01_annotation',
  'TUM_third_pool_5_01_01_annotation',
  'TUM_third_pool_6_01_01_annotation'],
 ['./data/TUM_third_pool/TUM_third_pool_1_01_01_annotation.parquet',
  './data/TUM_third_pool/TUM_third_pool_2_01_01_annotation.parquet',
  './data/TUM_third_pool/TUM_third_pool_3_01_01_annotation.parquet',
  './data/TUM_third_pool/TUM_third_pool_4_01_01_annotation.parquet',
  './data/TUM_third_pool/TUM_third_pool_5_01_01_annotation.parquet',
  './data/TUM_third_pool/TUM_third_pool_6_01_01_annotation.parquet'])

In [7]:
# Assemble the dataset configuration: the metadata path, the annotation files
# (name -> path) grouped under the pool keyword, and the target column key.
input_data_dict = dict(
    metadata=meta_data_filepath,
    annotations={
        pool_keyword: {
            name: filepath
            for name, filepath in zip(annotations_names, annotations_filepaths)
        }
    },
    parameters={"target_column_key": "intensities_raw"},
)

input_data_dict

{'metadata': './data/TUM_third_pool_meta_data.parquet',
 'annotations': {'third_pool': {'TUM_third_pool_1_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_1_01_01_annotation.parquet',
   'TUM_third_pool_2_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_2_01_01_annotation.parquet',
   'TUM_third_pool_3_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_3_01_01_annotation.parquet',
   'TUM_third_pool_4_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_4_01_01_annotation.parquet',
   'TUM_third_pool_5_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_5_01_01_annotation.parquet',
   'TUM_third_pool_6_01_01_annotation': './data/TUM_third_pool/TUM_third_pool_6_01_01_annotation.parquet'}},
 'parameters': {'target_column_key': 'intensities_raw'}}

In [8]:
# For now the configuration is handed over as a JSON file; later the dict can be
# fed directly as a data source.

import json

with open("input_config.json", mode="w") as config_file:
    json.dump(obj=input_data_dict, fp=config_file)

In [None]:
from dlomix.data import IntensityDataset
from dlomix.data.feature_extractors import (
    ModificationGainFeature,
    ModificationLocationFeature,
    ModificationLossFeature,
)

SEQ_LENGTH = 30
BATCH_SIZE = 128

# Build the intensity dataset from the JSON config written earlier in the notebook.
int_data = IntensityDataset(
    # --- data source and parsing ---
    data_source="input_config.json",
    parser="proforma",
    # --- column names ---
    sequence_col="modified_sequence",
    precursor_charge_col="precursor_charge_onehot",
    collision_energy_col="collision_energy_aligned_normed",
    intensities_col="intensities_raw",  # same key as "target_column_key" in the config
    # --- batching and train/validation split ---
    seq_length=SEQ_LENGTH,
    batch_size=BATCH_SIZE,
    val_ratio=0.15,
    # --- additional features derived from the modified sequences ---
    features_to_extract=[
        ModificationLocationFeature(),
        ModificationLossFeature(),
        ModificationGainFeature(),
    ],
    # --- metadata filters; the peptide length limit follows SEQ_LENGTH ---
    metadata_filtering_criteria={
        "peptide_length": f"<= {SEQ_LENGTH}",
        "precursor_charge": "<= 6",
        "fragmentation": "== 'HCD'",
        # Optional extra filter, currently disabled:
        # "mass_analyzer": "== 'FTMS'",
    },
)

2023-07-14 10:58:33.998247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-14 10:58:34.107662: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-14 10:58:34.752297: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-07-14 10:58:34.752367: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

Optionally Downloading and processing the data...
Annotations directory:  /cmnfs/home/l.mamisashvili/transforming-prosit/notebooks/./data/TUM_third_pool
Metadata filepath:  /cmnfs/home/l.mamisashvili/transforming-prosit/notebooks/./data/TUM_third_pool_meta_data.parquet
Base directory:  /cmnfs/home/l.mamisashvili/transforming-prosit/notebooks
--------------------------------------------------------------------------------
Starting processing and filtering the pool, this may take a while...
--------------------------------------------------------------------------------
Reading metadata file from /cmnfs/home/l.mamisashvili/transforming-prosit/notebooks/./data/TUM_third_pool_meta_data.parquet
Reading and processing annotation files...
Reading file:  /cmnfs/home/l.mamisashvili/transforming-prosit/notebooks/./data/TUM_third_pool/TUM_third_pool_1_01_01_annotation.parquet
Filtering annotation file...
Sorting by fragment_score...
Dropping duplicates...
Sorting by intensity...
Dropping duplicat

In [None]:
# Batch size times len(train_data) -- presumably the number of batches, so this
# is an upper bound if the last batch is partial; TODO confirm.
("Training examples", len(int_data.train_data) * BATCH_SIZE)


In [None]:
# Batch size times len(val_data) -- presumably the number of batches, so this
# is an upper bound if the last batch is partial; TODO confirm.
("Validation examples", len(int_data.val_data) * BATCH_SIZE)
