In [None]:
import pandas as pd
import numpy as np
import os
import re
from scipy.stats import skew
from matplotlib import pyplot as plt
from matplotlib import pyplot as plt, colors
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Read all PROSPECT parquet files

All .parquet files of the PROSPECT dataset need to be downloaded and saved to "../data/" prior to running this notebook.

In [None]:
data_dir = "../data/"
# Collect the parquet files in a fixed (sorted) order. os.listdir returns entries in
# arbitrary order, and the row order of the concatenated table influences the seeded
# shuffle performed later in this notebook, so sorting keeps the results reproducible.
parquet_paths = [
    os.path.join(data_dir, name)
    for name in sorted(os.listdir(data_dir))
    if name.endswith('.parquet')
]
if not parquet_paths:
    # Fail here with a clear message instead of a less obvious error when concatenating.
    raise FileNotFoundError(f"no .parquet files found in {data_dir!r}")
# One DataFrame per file; they are concatenated in the next cell.
dfs = [pd.read_parquet(path) for path in parquet_paths]

In [None]:
# Stack the per-file frames into one table. reset_index() (without drop) installs a fresh
# 0..N-1 row index and keeps the previous row labels as a regular column.
df = pd.concat(dfs).reset_index()
df

# Filter

Remove entries with missing values or with a low Andromeda score (70 or below).

In [None]:
# remove rows with missing data: a row is dropped when any of these columns is null
required_cols = [
    'raw_file',
    'scan_number',
    'indexed_retention_time',
    'modified_sequence',
    'andromeda_score',
]
to_remove = df[required_cols].isnull().any(axis=1)
print("# removed due to missing data:", to_remove.sum())
df = df.loc[~to_remove]

In [None]:
# select most likely sequence assignments: for every scan (raw_file, scan_number) keep the
# single candidate with the HIGHEST Andromeda score. The sort must be descending so that
# head(1) returns the best-scoring row of each group; an ascending sort would keep the
# lowest-scoring candidate instead.
df = (
    df.sort_values('andromeda_score', ascending=False)
      .groupby(['raw_file', 'scan_number'])
      .head(1)
)

In [None]:
# Histogram of the Andromeda scores of the remaining rows (1000 bins).
df['andromeda_score'].hist(bins=1000)

In [None]:
# remove low-quality sequence assignments: keep only rows scoring strictly above 70
df = df.loc[df['andromeda_score'].gt(70.)]

# Identify and replace PTMs

Find unique unimod identifiers of post-translationally modified residues and replace them with a unique single-character identifier.

In [None]:
# Pattern for one modified residue such as "M[UNIMOD:35]".
#   group 1: the whole token (the character before the bracket plus the bracketed tag)
#   group 2: the text between "UNIMOD:" and the closing bracket
# Written as a raw string so the backslashes reach the regex engine unchanged instead of
# relying on Python passing through invalid string escape sequences.
mod_regex = r'(.\[UNIMOD:(.*?)\])'
# Every distinct (token, id) pair matched by mod_regex across all sequences.
# Sequences without any match contribute a missing value after explode().
all_mods = (
    df['modified_sequence']
    .str.findall(mod_regex)
    .explode()
    .unique()
)
all_mods

In [None]:
# Build the replacement table: regex-escaped "<residue>[UNIMOD:<id>]" token -> one character
# whose code point is ord('Z') + <id>. Missing entries in all_mods are skipped.
# NOTE(review): the character depends only on the id, so the same id attached to different
# residues maps to the same character — confirm this is intended.
single_letter_encoding = {}
for token, unimod_id in all_mods[~pd.isnull(all_mods)]:
    single_letter_encoding[re.escape(token)] = chr(ord('Z') + int(unimod_id))

In [None]:
# Show the mapping from escaped UNIMOD tokens to their single-character replacements.
print(single_letter_encoding)

In [None]:
# Apply the replacement table as regex substitutions and store the encoded sequences
# in a new column; the original 'modified_sequence' column is left untouched.
encoded_sequences = df['modified_sequence'].replace(single_letter_encoding, regex=True)
df['modified_sequence_single_letter'] = encoded_sequences
df['modified_sequence_single_letter']

# Sample without replacement

For each unique modified sequence, randomly sample up to 10 dataset entries without replacement to serve as samples in the "sel10" dataset.

In [None]:
n = 10
sel_str = f'sel{n}'
# Flag column: True for rows that belong to the "sel<n>" subset, False otherwise.
df[sel_str] = False
# shuffle df (fixed seed), so that the first n rows of each group are a random draw
df = df.sample(frac=1, random_state=42)
# Row labels of the first (up to) n shuffled entries of every encoded sequence.
selected_idx = df.groupby('modified_sequence_single_letter').head(n).index
df.loc[selected_idx, sel_str] = True

# Compute median values & statistics

For all unique modified sequences, compute median indexed retention times and other statistics over all dataset entries with this sequence.

In [None]:
# Group the retention-time column by encoded sequence. The double brackets keep a
# one-column frame selection, which the aggregations in the following cells operate on.
grpd = df.groupby('modified_sequence_single_letter')[['indexed_retention_time']]

In [None]:
# Per-sequence statistics of the indexed retention time. Each entry becomes one output
# column (named after the key) computed with the given aggregation.
rt_stat_funcs = {
    'cnt': 'count',
    'std': 'std',
    'median': 'median',
    'mean': 'mean',
    'min': 'min',
    'max': 'max',
    'skew': lambda values: skew(values),
}
d = grpd.agg(**{
    stat_name: ('indexed_retention_time', stat_func)
    for stat_name, stat_func in rt_stat_funcs.items()
})

In [None]:
# Standard deviation per sequence computed with ddof=0, stored alongside the 'std'
# column produced by the named aggregation.
# NOTE(review): grpd.agg returns a one-column frame that is assigned to a single column
# here — confirm this behaves as intended on the pandas version in use.
d['npstd'] = grpd.agg(np.std, ddof=0)

In [None]:
# Drop the groupby object to clear some memory; the cells that follow only use `d` and `df`.
del grpd # clear some memory

In [None]:
# std. dev. vs. counts per sequence plot
# Every row of `d` is one unique encoded sequence, so each bin counts sequences
# (shown on a logarithmic color scale).
fig, ax = plt.subplots(figsize=(9, 7))
hh = ax.hist2d(d['cnt'], d['std'], range=((0, 2000), (0, 25)), bins=500, norm=colors.LogNorm())
ax.set_xlabel('entries per unique sequence (cnt)')
ax.set_ylabel('std. dev. of indexed retention time (std)')
ax.set_title('Retention-time spread vs. number of entries per sequence')
# hh[3] is the image returned by hist2d; it drives the colorbar.
fig.colorbar(hh[3], ax=ax, label='number of sequences')
plt.show()

In [None]:
# 2D-Histogram of group counts vs. median iRT
# Every row of `d` is one unique encoded sequence, so each bin counts sequences
# (shown on a logarithmic color scale).
fig, ax = plt.subplots(figsize=(9, 7))
hh = ax.hist2d(d['median'], d['cnt'], range=((-20, 130), (0, 2000)), bins=500, norm=colors.LogNorm())
ax.set_xlabel('median indexed retention time (median)')
ax.set_ylabel('entries per unique sequence (cnt)')
ax.set_title('Number of entries per sequence vs. median indexed retention time')
# hh[3] is the image returned by hist2d; it drives the colorbar.
fig.colorbar(hh[3], ax=ax, label='number of sequences')
plt.show()

# Splitting & Exporting datasets

6-fold cross-validation split (only 5 folds are used later on) with a fixed holdout set, applied to both the sel10 and the median datasets. The two dataset types are split identically (based on unique modified sequence).

In [None]:
# define 'sets' later used for composing cross-validation splits
# All fractions below are relative to the full set of unique sequences (d.index):
#   * 15 % go to the fixed holdout set, 85 % remain for cross-validation;
#   * each of the cv folds takes 0.85/cv of the total, made of a partition
#     (0.85/cv - 0.02) and a calibration set (0.02).
# train_test_split expects test_size relative to the data passed in, so every target
# fraction is divided by the fraction that is still unassigned at that point.
seqs_remaining, seqs_holdout = train_test_split(d.index, test_size=0.15, random_state=42) # 15 % holdout
cv = 6
partitions = []
calibs = []
for i in range(cv):
    # split the partition of fold i off the still-unassigned sequences
    seqs_remaining, seqs_part = train_test_split(seqs_remaining, test_size=min(1.0, (0.85/cv - 0.02) / (0.85 - i*0.85/cv)), random_state=42)
    try:
        # split the calibration set of fold i off what is left after the partition
        seqs_remaining,seqs_calib = train_test_split(seqs_remaining, test_size=min(1.0, 0.02 / (0.85 - i*0.85/cv - (0.85/cv - 0.02))), random_state=42)
    except ValueError:
        # In the last fold the requested fraction reaches 1.0 (nothing would be left on
        # the "train" side), which train_test_split rejects with a ValueError; everything
        # that remains becomes the calibration set. Other errors are no longer swallowed.
        seqs_calib = seqs_remaining
    calibs.append(seqs_calib)
    partitions.append(seqs_part)

In [None]:
# Label every unique sequence in `d` with the split it belongs to.
d['set'] = ''
for fold, (part_seqs, calib_seqs) in enumerate(zip(partitions, calibs)):
    d.loc[part_seqs, 'set'] = f'cv{fold}'
    d.loc[calib_seqs, 'set'] = f'cal{fold}'
d.loc[seqs_holdout, 'set'] = 'holdout'

In [None]:
# Report, for every split label, the number of unique sequences and their share of `d`.
split_names = ['holdout']
split_names += [f'cv{i}' for i in range(cv)]
split_names += [f'cal{i}' for i in range(cv)]
for split in split_names:
    rows_in_split = len(d.loc[d.set == split])
    print("split {:<13} : {:>6} {:>5.2f} %".format(split, rows_in_split, rows_in_split / len(d) * 100.))

In [None]:
# Propagate the per-sequence split labels to the individual dataset entries in `df`:
# every row gets the label of the split that contains its encoded sequence.
df['set'] = ''
row_seqs = df['modified_sequence_single_letter']
for fold, (part_seqs, calib_seqs) in enumerate(zip(partitions, calibs)):
    df.loc[row_seqs.isin(part_seqs), 'set'] = f'cv{fold}'
    df.loc[row_seqs.isin(calib_seqs), 'set'] = f'cal{fold}'
df.loc[row_seqs.isin(seqs_holdout), 'set'] = 'holdout'

In [None]:
# Order the per-sequence table by split label. reset_index(drop=False) moves the
# sequence index into a regular column so that it is kept as data.
d = (
    d.sort_values('set')
     .reset_index(drop=False)
)

In [None]:
# Order the per-entry table by split label and renumber the rows; the previous row
# labels are discarded (drop=True).
df = (
    df.sort_values('set')
      .reset_index(drop=True)
)

In [None]:
# compose splits and export

out_dir = "../data/"
# export all data in one csv (as a savepoint and reference)
all_csv_path = os.path.join(out_dir, 'PROSPECT_all_cv.csv')
df.to_csv(all_csv_path)

In [None]:
# compose and export "sel10" CV splits
# For fold i: validation = cv{i} + cal{i}; calibration = the calibration set of the next
# fold (wrapping around to cal0); training = every other cv/cal set.
cols = ['modified_sequence_single_letter', 'indexed_retention_time', 'andromeda_score']
in_sel10 = df['sel10']
for i in range(cv):
    validation_sets = [f'cv{i}', f'cal{i}']
    calibration_set = f'cal{(i + 1) % cv}'
    training_sets = [f'cv{j}' for j in range(cv) if j != i]
    training_sets += [f'cal{j}' for j in range(cv) if j != i]
    training_sets.remove(calibration_set)
    # file name -> row mask, written in the order training, validation, calibration
    fold_exports = {
        f'PROSPECT_sel10_training{i}.csv': df['set'].isin(training_sets),
        f'PROSPECT_sel10_validation{i}.csv': df['set'].isin(validation_sets),
        f'PROSPECT_sel10_calibration{i}.csv': df['set'] == calibration_set,
    }
    for csv_name, in_split in fold_exports.items():
        df.loc[in_sel10 & in_split, cols].to_csv(os.path.join(out_dir, csv_name))
df.loc[in_sel10 & (df['set'] == 'holdout'), cols].to_csv(os.path.join(out_dir, 'PROSPECT_sel10_holdout_cv.csv'))

In [None]:
# compose and export "median" CV splits
# Same fold composition as for the per-entry export, applied to the per-sequence table `d`:
# validation = cv{i} + cal{i}; calibration = the next fold's calibration set (wrapping
# around to cal0); training = every other cv/cal set.
for i in range(cv):
    validation_sets = [f'cv{i}', f'cal{i}']
    calibration_set = f'cal{(i + 1) % cv}'
    training_sets = [f'cv{j}' for j in range(cv) if j != i]
    training_sets += [f'cal{j}' for j in range(cv) if j != i]
    training_sets.remove(calibration_set)
    # file name -> row mask, written in the order training, validation, calibration
    fold_exports = {
        f'PROSPECT_median_training{i}.csv': d['set'].isin(training_sets),
        f'PROSPECT_median_validation{i}.csv': d['set'].isin(validation_sets),
        f'PROSPECT_median_calibration{i}.csv': d['set'] == calibration_set,
    }
    for csv_name, in_split in fold_exports.items():
        d.loc[in_split].to_csv(os.path.join(out_dir, csv_name))
d.loc[d['set'] == 'holdout'].to_csv(os.path.join(out_dir, 'PROSPECT_median_holdout_cv.csv'))